-2

我正在尝试从一个字符串(或段落)中删除英文单词。但问题是并非所有的词都被删除了。不过,当我用只有 25 个单词的较短字符串测试时,它可以完美运行。我试图通过 3 个步骤来过滤它:

  1. 从字符串中删除链接
  2. 删除 XML 标记。
  3. 删除英文单词。

下面是代码:

// Shared mutable state used by removeLinks / removeXmlTags / englishWords below.

// Not referenced by any of the methods visible in this snippet.
String SWList[];
// Collects tokens: removeLinks appends every kept (non-link) token and
// englishWords appends every token of the filtered text.
public ArrayList<String> tokens = new ArrayList<String>();
// Scratch text assembled by removeLinks; removeLinks resets it to null before returning.
String sentenceSoFar="";
// Not read by the visible methods (removeLinks declares a local with the same name).
String nextToken;
// Running concatenation of the non-link tokens seen by removeLinks.
String withoutLink=null;
// Not referenced by any of the methods visible in this snippet.
ArrayList<String>    wordscount = new ArrayList<String>();
// Set to false by removeLinks whenever it meets a token without "http".
boolean flag=false;
// Not referenced by the visible methods (removeLinks takes its own parameter named str).
String str;
// removeLinks only keeps tokens while this is 0.
int counter;
// Scratch text assembled by englishWords; englishWords resets it to null before returning.
String finalStr="";
// Not referenced by any of the methods visible in this snippet.
ArrayList<String> spaceCheck = new ArrayList<String>();

/**
 * Strips link tokens from {@code str} and passes the remaining text on to
 * {@link #removeXmlTags(String)}.
 *
 * <p>The input is split on single spaces; a token counts as a link when it
 * contains the substring "http". Every surviving token is appended to
 * {@code tokens} and to {@code withoutLink}, but only while {@code counter}
 * is 0. {@code sentenceSoFar} holds the filtered text during the call and is
 * reset to {@code null} before returning.
 *
 * @param str text to filter; {@code null} is treated as empty text
 */
public void removeLinks(String str) {
    // A null input used to blow up inside StringTokenizer; treat it as "nothing to do".
    String input = (str == null) ? "" : str;
    StringTokenizer st = new StringTokenizer(input, " ");

    // Accumulate in a builder that starts empty. The previous version started
    // from a null String, so the result began with the literal text "null".
    StringBuilder kept = new StringBuilder();

    while (st.hasMoreTokens()) {
        String token = st.nextToken();
        if (token.contains("http")) {
            continue; // link: drop it
        }
        flag = false;
        if (counter != 0) {
            continue; // tokens are only collected while counter is 0
        }
        tokens.add(token);
        // Each token keeps a leading space so that the space-padded stop-word
        // patterns applied later can also match the first word.
        kept.append(' ').append(token);
    }

    if (kept.length() > 0) {
        // withoutLink starts out as null; never concatenate onto the null itself.
        withoutLink = (withoutLink == null ? "" : withoutLink) + kept;
    }

    // Always hand over a real (possibly empty) string, never null.
    sentenceSoFar = kept.toString();
    removeXmlTags(sentenceSoFar);
    sentenceSoFar = null;
}

/**
 * Replaces escaped-markup fragments and other noise strings in {@code strTags}
 * with a space and passes the result on to {@link #englishWords(String)}.
 *
 * <p>Every entry of the stop list is matched literally and case-insensitively.
 * Entries are applied in list order, so earlier replacements can prevent later
 * entries from matching.
 *
 * @param strTags text to clean; {@code null} is treated as empty text
 */
public void removeXmlTags(String strTags) {
    // NOTE(review): bare entries such as "br", "gt", "nb", "link", "style" and
    // "class" are removed wherever they occur, including inside longer words.
    // Confirm that this is intended before relying on the output as prose.
    String[] stopWords = new String[] {
        "&amp;nbsp;&lt;/p&gt;",
        " &amp;nbsp;&lt;/p&gt; ",
        ".&lt;/p&gt;"," .&lt;/p&gt; ",
        "1??&quot;&gt;&lt;span&gt;&amp;nbsp;",
        " 1??&quot;&gt;&lt;span&gt;&amp;nbsp; ",
        "&lt;p"," &lt;p ",
        "  align=&quot;center&quot;&gt; ",
        " align=&quot;center&quot;&gt;",
        ";",
        "&",
        "/&",
        "&lt",
        " &lt ",
        "_rdEdi",
        " _rdEdi ",
        "br",
        " br ",
        "gt",
        " gt ",
        "exLink",
        " exLink ",
        "link",
        " link ",
        "&gt",
        " &gt ",
        "style",
        " style ",
        ";/div& ",
        "class",
        " cestry ",
        "-",
        " - ",
        "nb",
        " nb ",
        " a ",
        "&lt;p&gt;",
        "&#160;",
        ";/b&",
        ",",
        "/",
        " It ",
        " strong ",
        " span ",
        " Responsibilities ",
        " bull ",
        " amp ",
        " b ",
        " d ",
        " e ",
        " f ",
        " g ",
        " h ",
        " i ",
        " j ",
        " k ",
        " l ",
        " m ",
        " n ",
        " o ",
        " p ",
        " q ",
        " r ",
        " s ",
        " t ",
        " u ",
        " v ",
        " w ",
        " x ",
    };

    // replaceAll on null would throw; an empty text simply flows through.
    String cleaned = (strTags == null) ? "" : strTags;

    for (String stopword : stopWords) {
        // Entries that end in a space must not consume that space: it is also the
        // leading space of the next word, and swallowing it let every second one
        // of several adjacent stop words survive (" a a " became " a ").
        // Match the entry without its last space, require the space via lookahead,
        // and delete the match; the untouched space is the replacement.
        boolean spaceTerminated = stopword.length() > 1 && stopword.endsWith(" ");
        String literal = spaceTerminated
                ? stopword.substring(0, stopword.length() - 1)
                : stopword;

        // Quote the entry so that '.', '?' and similar characters are matched
        // literally instead of being interpreted as regex operators.
        String regex = java.util.regex.Pattern.quote(literal)
                + (spaceTerminated ? "(?= )" : "");

        cleaned = java.util.regex.Pattern
                .compile(regex, java.util.regex.Pattern.CASE_INSENSITIVE)
                .matcher(cleaned)
                .replaceAll(spaceTerminated ? "" : " ");
    }

    englishWords(cleaned);
}

public void englishWords(String strWords) {

    finalStr=null;
    String[] stopWords = new String[]{
        " i " , " a " , " natural " , " and " , " if " ," your" ," about " , " an " , " are " , " as " , " at " , " be " , "  by " ," was " ," leadership " ,
        " com " , " for " , " from " , " how " , " in " , " is " , " it " , " not " , " of " , " on " , " or " , " that " , " the " , " this " , " to " , "lt","quot",
        " what " , " when " , " where " , " who " , " will " , " with " , " the " , " www " ," role " ," provides" ," you " ,"&amp;nbsp;&lt;/p&gt; " ,"align","temp","tor",
        " Inc." ,"  Inc." ," is " ," an " ," equal" ," equal " ," Opportunity" ," Opportunity " ," Employer" ,"  Employer " ,"  The " ," company" ," candidates" ,"center",
        " company " ," its" ," affiliates" ,"  affiliates " ,"  recruit " ,"  hire " ,"  qualified " ," candidates" ,"  candidates " ,"  today " ," Facebook " ,
        " without " ,"  without " ," regard" ,"  regard " ," to " ," race" ," race " ," religion" ," religion " ," color " ," color " , " sex " ," sexual " ,
        " sexual " ,"  orientation " ,"  orientation " ," gender " ," gender " , " identity " ," identity " ," age " ," national " ," national " ," origin" ,
        " origin " ," ancestry" ," ancestry " ," citizenship" ," citizenship " , " veteran" ," veteran" , "  or " ," disability" ,"  disability " ," status" ,
        " status " ,"  medical" ,"  medical " ," condition" ," condition " ," marital" ," marital " , " any" ,"  any " , " other" ,"  other " ," factor" ,"  factor " ,
        " prohibited" ,"  prohibited " ," state " ,"  state " ," provincial" ,"  provincial " ," and " ," federal" ,"  federal " ," municipal" ,"  municipal " ,
        " it " ," ul " ," LI " ," HR " ," div " ," it " ," ul " ," lt " , " sp " , " Nurse " ," join " ," our " ,"  Overview " ,"  specializes " ,"  highly " ," sampling " ,
        " Description " ," Requirements " ," Intensive " ," Care " ," StartDate " ," ASAP " ," Available " ," Shifts " ," Exclusive " ," order " ," Serving " ,
        " throughout " ,"  county " ," members " ," range " ," more " ,"  provide " ," Emergency " ," currently " ," customer " ,"  unparalleled " ,"  Spending" , 
        " looking " ," Critical " ," Facility " ,"  boggling " ," entertainment " ," service " ," benefits " ," commitment " ," outdoor " ," comprehensive " ,
        " settings " ," patient " ,"  exhilarating " ,"  interventions " ,"  environments " ," nurses " ," needs " ," travel " ,"  primary " ," see " ,"  experience" ,
        " gas " ,"  transportation " ,"  machine " ,"  construction " ," mining " ," industries " ," detailed " ," corrective " ,"  action " ,"  both " ," management " ,
        " management " ,"  Receiving " ,"  Inspection " ," verification " ," established " ," which " ," material " ," acceptance " ," measurement " ,
        ," training " ," Familiar " ,"  shipment " ,
        " levels " ,"  drawings " ," knowledge " ," Recruiter " ," Recruiter: " ,"  long " ," short " ," years " ," opportunities " ," competition " ,"  until " ," Email " ," here " ," quot " ," replace " ," schedule " ," Flexible " ,
        " these"  ," can " ,"  manage " ,"  multiple " ," tasks " ," simultaneously " ," adapt " ," market " ," changes? " ," basic " ," qualifications " ," only " ,
        " half " ," story " ,"  considering " ," 7 " ," eleven " ," right " ," choice " ," should " ," consider " ," they " ," possess " ,"  traits " ," most " ," common " ,
        " successful " ," 7 " ,"  eleven " ,"  franchisees " ," can "  ,"  train " ,"  supervise " ," employees? " ," willing " ," empower " ,"  them " ," delegate " ," them? " ,
        " dedicated " ," operations "  ," excellence? " ," do " ,"  focus " ," details? " ," committed " ," creating " ," managing " ," organization " ,"  effectively " ,
        " recruits " ,"  trains " ,"  retains " ,"  motivates " ," people " ," do " ,"  have " ,"  desire " ,"  build " ," emental " ," me " ,"  through " ," execution " ," ability " ,
        " programs " ,"  strategies? " ,"  do" ,"  have " ,"  food" ," can " , " aur " , " join "
    };

    for (String stopword : stopWords)
    {
        strWords = strWords.replaceAll("(?i)"+stopword, " ");
    }

    String delims = " , = ; : ' * % $ @ 0 - _ + ( ) .";
    StringTokenizer st = new StringTokenizer(strWords, delims);

    finalStr =null;
    while(st.hasMoreTokens()) {

        String ntoken = st.nextToken();
        //    System.out.println("LINK CHECK : " + ntoken);

        tokens.add(ntoken);
        finalStr += " "+ ntoken;
        //withoutLink+= " " + nextToken;

    }

    //    System.out.println("Different  STRING : " + finalStr);
    //    new indexing.IndexAlgo().algoOne(finalStr);
    finalStr=null;
}

如果有人可以帮助我提供一些更好的逻辑或代码,我将不胜感激。

4

1 回答 1

0

第一个和第三个方法看起来应该可以工作,尽管其中有些奇怪的代码行毫无意义。不过,我还没有真正深入研究第二个方法,大概是因为我对正则表达式并不擅长。我认为您应该将 /p 与 Unicode 属性一起使用。我不知道这里的 \p 表达式应该是什么意思。

我不确定您使用的是哪种输入以及您的结果是什么。为什么你认为你的代码不起作用?

于 2013-08-29T21:39:31.220 回答