0

我正在创建一个小型应用程序,它将打开一个 word 文档,扫描它以获取信用卡号(不同的模式),替换文本,保存并关闭文档。

我的代码相当简单:

using System;
using System.IO;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using Word = Microsoft.Office.Interop.Word;

namespace ParseFilesAndRemoveRegExp
{
    class Program
    {
        static void Main(string[] args)
        {
            FileManagement m = new FileManagement();
            m.OpenSearchAndReplace();
        }
    }

    class FileManagement
    {
        Word.Application wordapp;

        public FileManagement()
        {
            try
            {
                wordapp = new Word.Application();
            }
            catch(Exception ex)
            {
                if (ex != null)
                {
                    string s = ex.ToString();
                }
            }
        }

        internal void OpenSearchAndReplace()
        {
            object nullobj = System.Reflection.Missing.Value;
            try
            { 
                object filename = @"c:\\temp\\document.docx";
                object replaceAll = Word.WdReplace.wdReplaceAll;

                object matchWildCards = true;
                object readOnly = false;
                object isVisible = false;

                Word.Document doc = wordapp.Documents.Open( ref filename, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, 
                                                            ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                                            ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
                doc.Activate();
                wordapp.Selection.Find.ClearFormatting();

                //wordapp.Selection.Find.Text = "[0-9]{16}";
                wordapp.Selection.Find.Text = "\b(?:[0-9][ -]*?){13,16}\b";
                wordapp.Selection.Find.Replacement.ClearFormatting();
                wordapp.Selection.Find.Replacement.Text = "---Cardnumber automatically removed---";

                wordapp.Selection.Find.Execute(ref nullobj, ref nullobj, ref nullobj, ref matchWildCards,
                                    ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                    ref replaceAll, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
                doc.Save();
            }
            catch(Exception ex)
            {
                string s = ex.ToString();
                if( wordapp != null )
                {
                    //wordapp.Documents.Close( ref nullobj, ref nullobj, ref nullobj );
                    wordapp.Quit( ref nullobj, ref nullobj, ref nullobj );
                }
            }
        }
    }
}

但是 - 运行它时出现异常:“System.Runtime.InteropServices.COMException (0x800A15B8): Find What 文本包含无效的模式匹配表达式”。

我认为这可能与我发送到 Word 的字符有关,因此我之前将 \d 与 [0-9] 交换。但没有变化。如果我使用 [0-9]{16} 运行,它会将 1234567891012345 替换为我要使用的字符串。

有谁可以帮我离开这里吗?我是否必须使用许多不同的正则表达式来搜索来管理文档,或者这可以通过一个简单的正则表达式来完成,就像我已经拥有的一样?

4

5 回答 5

3

尝试\\b代替\b. 否则,字符串解析器将尝试将 ascii 代码007(bell) 放入字符串中,而您将得不到匹配。

于 2010-02-10T08:45:09.220 回答
1

以非常简单的方式做这件事给了我一些有用的东西:

for (int i = 0; i < 3; ++i)
            { 
                if( i == 0 )
                    wordapp.Selection.Find.Text = "[0-9]{16}";
                else if( i == 1 )
                    wordapp.Selection.Find.Text = "[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}";
                else if( i == 2 )
                    wordapp.Selection.Find.Text = "[0-9]{4} [0-9]{4} [0-9]{4} [0-9]{4}";

                wordapp.Selection.Find.Execute( ref nullobj, ref nullobj, ref nullobj, ref matchWildCards,
                                                ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                                ref replaceAll, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
            }

这不是一个很好的设置,但是嘿 - 它有效。删除了 XXXXXXXXXXXXXXXX、XXXX XXXX XXXX XXXX 和 XXXX-XXXX-XXXX-XXXX 等数字。如有必要,我会添加其他人。

于 2010-02-10T09:44:21.620 回答
1

你试过逃跑吗?:

wordapp.Selection.Find.Text = @"\b(?:[0-9][ -]*?){13,16}\b"; 

如果这不起作用,您需要从一个简单的正则表达式(或者实际上只是一个纯文本单词)开始,验证它是否有效,然后分阶段构建正则表达式。

于 2010-02-10T08:49:46.407 回答
0

My guess would be that Word has its own flavour of regex. Have you tried opening a document in Word and using that regex in the Find and Replace dialog?

Actually, according to http://www.regexinference.com/documentation/Microsoft-Word-Wildcards-as-Regular-Expressions.html, Word doesn't support non-capturing parenthesis, so you're going to have to come up with a different solution.

于 2010-02-10T09:29:50.143 回答
0

我们有以下作为迄今为止超越单行的最佳解决方案。这不是 ms 字,但你可以肯定得到你想要的。

private const string _creditCardPatternMatchingExpression = @"(?m:-[*]\w{2}\d{15,16})|(?m:CC\w{2}\d{15,16})|(?m:\d{15,16})|(\d{4}-\d{4}-\d{4}-\d{4})|(\d{4}-\d{6}-\d{5})";

        public static string CleanCreditCardData(this String contentThatMayHaveCreditCardData)
    {
        string initiallyCleanedUpData = Regex.Replace(contentThatMayHaveCreditCardData, _creditCardPatternMatchingExpression, "CCXXXXXXXXXXXXXX");
        string completeSpaceEnterCleanedUpVersion = initiallyCleanedUpData.ToLower().Replace("\r\n", "").Replace("\n", "").Replace(" ", "").Replace("-", "").Replace("<br>", "").Replace("<br />", "").Replace("<br/>", "").Replace("&nbsp;", "");
        if (Regex.IsMatch(completeSpaceEnterCleanedUpVersion,_creditCardPatternMatchingExpression))
            return Regex.Replace(completeSpaceEnterCleanedUpVersion, _creditCardPatternMatchingExpression, "CCXXXXXXXXXXXXXX");

        return initiallyCleanedUpData;
    }
于 2011-08-22T20:10:24.660 回答