我正在尝试从字符串(或段落)中删除英文单词,但问题是并非所有的词都被删除了。不过,当我使用只有 25 个单词的较短字符串时,它可以完美运行。我试图通过 3 个步骤进行过滤:
- 从字符串中删除链接
- 删除 XML 标记。
- 删除英文单词。
下面是代码:
// Shared mutable state for the three-stage filter
// (removeLinks -> removeXmlTags -> englishWords).

// Not referenced by any of the visible methods.
String SWList[];
// Accumulates every token kept by removeLinks and every token produced by englishWords.
public ArrayList<String> tokens = new ArrayList<String>();
// Scratch text assembled by removeLinks and handed on to removeXmlTags.
String sentenceSoFar="";
// Not referenced by any of the visible methods.
String nextToken;
// Running text of the non-link tokens; starts out as null and is appended to by removeLinks.
String withoutLink=null;
// Not referenced by any of the visible methods.
ArrayList<String> wordscount = new ArrayList<String>();
// Assigned false by removeLinks; never read in the visible code.
boolean flag=false;
// Not referenced by any of the visible methods (removeLinks takes a parameter with the same name).
String str;
// Defaults to 0; removeLinks only keeps tokens while this is 0.
int counter;
// englishWords assembles the fully filtered text here.
String finalStr="";
// Not referenced by any of the visible methods.
ArrayList<String> spaceCheck = new ArrayList<String>();
/**
 * Stage 1 of the filter: drops every space-delimited token that contains
 * "http" and forwards the remaining text to {@link #removeXmlTags(String)}.
 *
 * <p>Each kept token is also appended to {@code tokens} and to
 * {@code withoutLink}. The forwarded text has one leading space per token
 * (for example {@code " foo bar"}).
 *
 * <p>Fixes over the previous version: the text is no longer prefixed with the
 * literal string "null" (it used to be built with {@code +=} on a null
 * reference), a null or link-only input no longer causes a
 * NullPointerException downstream, and an unreachable branch was removed.
 *
 * @param str raw input text; {@code null} is treated as an empty string
 */
public void removeLinks(String str) {
    String input = (str == null) ? "" : str;
    StringBuilder kept = new StringBuilder();
    StringTokenizer st = new StringTokenizer(input, " ");
    while (st.hasMoreTokens()) {
        String token = st.nextToken();
        if (token.contains("http")) {
            continue; // looks like a link: skip it
        }
        flag = false;
        // Guard kept from the original: tokens are only collected while counter is 0.
        if (counter == 0) {
            tokens.add(token);
            kept.append(' ').append(token);
        }
    }
    if (kept.length() > 0) {
        // withoutLink starts out as null; never concatenate onto the null reference.
        withoutLink = (withoutLink == null ? "" : withoutLink) + kept;
    }
    sentenceSoFar = kept.toString();
    removeXmlTags(sentenceSoFar);
    sentenceSoFar = null;
}
public void removeXmlTags(String strTags) {
//strTags=null;
String[] stopWords = new String[] {
"&nbsp;</p>",
" &nbsp;</p> ",
".</p>"," .</p> ",
"1??"><span>&nbsp;",
" 1??"><span>&nbsp; ",
"<p"," <p ",
" align="center"> ",
" align="center">",
";",
"&",
"/&",
"<",
" < ",
"_rdEdi",
" _rdEdi ",
"br",
" br ",
"gt",
" gt ",
"exLink",
" exLink ",
"link",
" link ",
">",
" > ",
"style",
" style ",
";/div& ",
"class",
" cestry ",
"-",
" - ",
"nb",
" nb ",
" a ",
"<p>",
" ",
";/b&",
",",
"/",
" It ",
" strong ",
" span ",
" Responsibilities ",
" bull ",
" amp ",
" b ",
" d ",
" e ",
" f ",
" g ",
" h ",
" i ",
" j ",
" k ",
" l ",
" m ",
" n ",
" o ",
" p ",
" q ",
" r ",
" s ",
" t ",
" u ",
" v ",
" w ",
" x ",
};
{
for (String stopword : stopWords) {
strTags = strTags.replaceAll("(?i)"+stopword, " ");
}
}
//System.out.println("OUTPUT STRING WITHOUT TAGS : " + strTags);
englishWords(strTags);
strTags=null;
}
public void englishWords(String strWords) {
finalStr=null;
String[] stopWords = new String[]{
" i " , " a " , " natural " , " and " , " if " ," your" ," about " , " an " , " are " , " as " , " at " , " be " , " by " ," was " ," leadership " ,
" com " , " for " , " from " , " how " , " in " , " is " , " it " , " not " , " of " , " on " , " or " , " that " , " the " , " this " , " to " , "lt","quot",
" what " , " when " , " where " , " who " , " will " , " with " , " the " , " www " ," role " ," provides" ," you " ,"&nbsp;</p> " ,"align","temp","tor",
" Inc." ," Inc." ," is " ," an " ," equal" ," equal " ," Opportunity" ," Opportunity " ," Employer" ," Employer " ," The " ," company" ," candidates" ,"center",
" company " ," its" ," affiliates" ," affiliates " ," recruit " ," hire " ," qualified " ," candidates" ," candidates " ," today " ," Facebook " ,
" without " ," without " ," regard" ," regard " ," to " ," race" ," race " ," religion" ," religion " ," color " ," color " , " sex " ," sexual " ,
" sexual " ," orientation " ," orientation " ," gender " ," gender " , " identity " ," identity " ," age " ," national " ," national " ," origin" ,
" origin " ," ancestry" ," ancestry " ," citizenship" ," citizenship " , " veteran" ," veteran" , " or " ," disability" ," disability " ," status" ,
" status " ," medical" ," medical " ," condition" ," condition " ," marital" ," marital " , " any" ," any " , " other" ," other " ," factor" ," factor " ,
" prohibited" ," prohibited " ," state " ," state " ," provincial" ," provincial " ," and " ," federal" ," federal " ," municipal" ," municipal " ,
" it " ," ul " ," LI " ," HR " ," div " ," it " ," ul " ," lt " , " sp " , " Nurse " ," join " ," our " ," Overview " ," specializes " ," highly " ," sampling " ,
" Description " ," Requirements " ," Intensive " ," Care " ," StartDate " ," ASAP " ," Available " ," Shifts " ," Exclusive " ," order " ," Serving " ,
" throughout " ," county " ," members " ," range " ," more " ," provide " ," Emergency " ," currently " ," customer " ," unparalleled " ," Spending" ,
" looking " ," Critical " ," Facility " ," boggling " ," entertainment " ," service " ," benefits " ," commitment " ," outdoor " ," comprehensive " ,
" settings " ," patient " ," exhilarating " ," interventions " ," environments " ," nurses " ," needs " ," travel " ," primary " ," see " ," experience" ,
" gas " ," transportation " ," machine " ," construction " ," mining " ," industries " ," detailed " ," corrective " ," action " ," both " ," management " ,
" management " ," Receiving " ," Inspection " ," verification " ," established " ," which " ," material " ," acceptance " ," measurement " ,
," training " ," Familiar " ," shipment " ,
" levels " ," drawings " ," knowledge " ," Recruiter " ," Recruiter: " ," long " ," short " ," years " ," opportunities " ," competition " ," until " ," Email " ," here " ," quot " ," replace " ," schedule " ," Flexible " ,
" these" ," can " ," manage " ," multiple " ," tasks " ," simultaneously " ," adapt " ," market " ," changes? " ," basic " ," qualifications " ," only " ,
" half " ," story " ," considering " ," 7 " ," eleven " ," right " ," choice " ," should " ," consider " ," they " ," possess " ," traits " ," most " ," common " ,
" successful " ," 7 " ," eleven " ," franchisees " ," can " ," train " ," supervise " ," employees? " ," willing " ," empower " ," them " ," delegate " ," them? " ,
" dedicated " ," operations " ," excellence? " ," do " ," focus " ," details? " ," committed " ," creating " ," managing " ," organization " ," effectively " ,
" recruits " ," trains " ," retains " ," motivates " ," people " ," do " ," have " ," desire " ," build " ," emental " ," me " ," through " ," execution " ," ability " ,
" programs " ," strategies? " ," do" ," have " ," food" ," can " , " aur " , " join "
};
for (String stopword : stopWords)
{
strWords = strWords.replaceAll("(?i)"+stopword, " ");
}
String delims = " , = ; : ' * % $ @ 0 - _ + ( ) .";
StringTokenizer st = new StringTokenizer(strWords, delims);
finalStr =null;
while(st.hasMoreTokens()) {
String ntoken = st.nextToken();
// System.out.println("LINK CHECK : " + ntoken);
tokens.add(ntoken);
finalStr += " "+ ntoken;
//withoutLink+= " " + nextToken;
}
// System.out.println("Different STRING : " + finalStr);
// new indexing.IndexAlgo().algoOne(finalStr);
finalStr=null;
}
如果有人可以帮助我提供一些更好的逻辑或代码,我将不胜感激。