我有一个问题:我需要将输入字符串拆分为所有可能的 Prefix、Stem 和 Suffix 组合。
规则是:
Prefix = 0-4 个字符
Stem = 1-* 个字符(至少 1 个字符,没有上限)
Suffix = 0-6 个字符。
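假设我输入了“wbAlErbyp”,则需要按上述规则对其进行拆分,例如(示例不完整):Prefix 为空、Stem =“wbAlErbyp”、Suffix 为空;Prefix =“w”、Stem =“bAlErbyp”、Suffix 为空;Prefix =“wbAl”、Stem =“Erb”、Suffix =“yp”。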
有什么想法可以实现这一目标吗?
编辑 1:
好的,这是我的旧解决方案(它很冗长,也不够专业)。我已经看不懂它了,想要重新设计。
/// <summary>
/// Finds every way the given Arabic word can be split into a prefix
/// (0-4 characters), a stem (1 or more characters) and a suffix
/// (0-6 characters) such that all three parts exist in their lookup
/// collections and their types are mutually compatible.
/// </summary>
/// <param name="inputTextArabic">
/// The word in Arabic script. Characters that are not in the
/// transliteration table are ignored.
/// </param>
/// <returns>
/// One meaning string per compatible prefix/stem/suffix combination, in the
/// order the combinations are found (shortest stem first, then shortest
/// prefix). The list is empty when the input is null or empty, or when
/// nothing matches.
/// </returns>
public static List<string> GetMatches(string inputTextArabic)
{
    // store matches/results here
    List<string> results = new List<string>();

    // Nothing to split: return the empty list rather than failing on null.
    if (string.IsNullOrEmpty(inputTextArabic))
    {
        return results;
    }

    const int MaxPrefixLength = 4;
    const int MaxSuffixLength = 6;

    // The lookup collections are keyed on the Latin transliteration.
    string latString = TransliterateArabicToLatin(inputTextArabic);
    int lenWord = latString.Length;

    // Loop through the stem sizes (1-*), then the prefix sizes. Iterating
    // in this order keeps the result order stable.
    for (int lenStem = 1; lenStem <= lenWord; lenStem++)
    {
        // The prefix can use at most what the stem leaves over, capped at 4.
        int lenPrefMax = lenWord - lenStem < MaxPrefixLength
            ? lenWord - lenStem
            : MaxPrefixLength;

        for (int lenPref = 0; lenPref <= lenPrefMax; lenPref++)
        {
            // The three parts must cover the whole word, so the suffix
            // length is fully determined by the other two.
            int lenSuff = lenWord - lenStem - lenPref;
            if (lenSuff > MaxSuffixLength)
            {
                continue;
            }

            string prefix = latString.Substring(0, lenPref);
            string stem = latString.Substring(lenPref, lenStem);
            string suffix = latString.Substring(lenPref + lenStem, lenSuff);

            // See if each part exists in its lookup collection; stop as
            // soon as one of them is missing.
            List<WordBit> prefMatches = (from x in prefixes where x.NoVowels == prefix select x).Distinct().ToList();
            if (prefMatches.Count == 0)
            {
                continue;
            }

            List<WordBit> stemMatches = (from x in stems where x.NoVowels == stem select x).Distinct().ToList();
            if (stemMatches.Count == 0)
            {
                continue;
            }

            List<WordBit> suffMatches = (from x in suffixes where x.NoVowels == suffix select x).Distinct().ToList();
            if (suffMatches.Count == 0)
            {
                continue;
            }

            // Now that they are found, see if they go together.
            foreach (WordBit prefMatch in prefMatches)
            {
                string prefType = prefMatch.Type;

                foreach (WordBit stemMatch in stemMatches)
                {
                    string stemType = stemMatch.Type;

                    // This check does not depend on the suffix, so it is
                    // done once per prefix/stem pair.
                    bool prefStemConnects = (from x in prefixStemConns where x.Type1 == prefType && x.Type2 == stemType select x).Any();
                    if (!prefStemConnects)
                    {
                        continue;
                    }

                    foreach (WordBit suffMatch in suffMatches)
                    {
                        string suffType = suffMatch.Type;

                        bool stemSuffConnects = (from x in stemSuffixConns where x.Type1 == stemType && x.Type2 == suffType select x).Any();
                        bool prefSuffConnects = (from x in prefixSuffixConns where x.Type1 == prefType && x.Type2 == suffType select x).Any();

                        // All three pairings must be compatible. The
                        // prefix/suffix pairing was previously computed but
                        // never checked.
                        if (!(stemSuffConnects && prefSuffConnects))
                        {
                            continue;
                        }

                        // they all connect: we have found a match!
                        Match match = new Match();

                        // Root meaning: the stem's Extra text up to the
                        // first double whitespace (or the end of the text).
                        match.MatchMeaning = match.RootMeaning = Regex.Match(stemMatch.Extra, @"^.*?(?=\s\s|$)").ToString();

                        // Suffix info: the suffix's Extra text up to the
                        // first double whitespace; empty if there is none.
                        match.SuffixInfo = Regex.Match(suffMatch.Extra, @"^.*?(?=\s\s)").ToString();
                        if (match.SuffixInfo != "")
                        {
                            if (match.SuffixInfo.Contains("<verb>"))
                            {
                                // The suffix text is a template: put the
                                // root meaning in place of the <verb> marker.
                                match.MatchMeaning = match.SuffixInfo.Replace("<verb>", match.RootMeaning);
                                match.SuffixInfo = "";
                            }
                            else
                            {
                                match.MatchMeaning = match.MatchMeaning + " " + match.SuffixInfo;
                            }
                        }

                        // Prefix info: the first whitespace-delimited part
                        // of the prefix's Extra text.
                        match.PrefixInfo = Regex.Match(prefMatch.Extra, @"^.*?(?=\s|\s\s|$)").ToString();
                        if (match.PrefixInfo != "")
                        {
                            // NOTE(review): this rebuilds the meaning from
                            // RootMeaning, so a <verb> substitution made
                            // above is discarded whenever a prefix is
                            // present - confirm that this is intended.
                            match.MatchMeaning = match.PrefixInfo + " " + match.RootMeaning + " " + match.SuffixInfo;
                        }

                        results.Add(match.MatchMeaning);

                        Debug.Print("_____________________________________________________________________________________");
                        Debug.Print(prefMatch.NoVowels + "\t\t" + prefMatch.Vowels + "\t\t" + prefMatch.Type + "\t\t" + prefMatch.Extra);
                        Debug.Print(stemMatch.NoVowels + "\t\t" + stemMatch.Vowels + "\t\t" + stemMatch.Type + "\t\t" + stemMatch.Extra);
                        Debug.Print(suffMatch.NoVowels + "\t\t" + suffMatch.Vowels + "\t\t" + suffMatch.Type + "\t\t" + suffMatch.Extra);
                        Debug.Print("______________________________________________________________________________________");
                    }
                }
            }
        }
    }

    return results;
}

/// <summary>
/// Converts Arabic-script text to its one-character-per-letter Latin
/// transliteration. Characters without an entry in the table are dropped.
/// </summary>
/// <remarks>
/// NOTE(review): the table looks like a Buckwalter-style transliteration
/// (with '\' used for hamza) - confirm against the dictionary data.
/// </remarks>
private static string TransliterateArabicToLatin(string arabicText)
{
    // A builder avoids re-allocating the string for every character.
    System.Text.StringBuilder latin = new System.Text.StringBuilder(arabicText.Length);

    foreach (char arabicChar in arabicText)
    {
        switch (arabicChar)
        {
            #region ARABIC TO LATIN TABLE
            case 'ا': latin.Append('A'); break;
            case 'آ': latin.Append('|'); break;
            case 'ؤ': latin.Append('&'); break;
            case 'ئ': latin.Append('}'); break;
            case 'أ': latin.Append('>'); break;
            case 'إ': latin.Append('<'); break;
            case 'ء': latin.Append('\\'); break;
            case 'ب': latin.Append('b'); break;
            case 'ت': latin.Append('t'); break;
            case 'ة': latin.Append('p'); break;
            case 'ث': latin.Append('v'); break;
            case 'ج': latin.Append('j'); break;
            case 'ح': latin.Append('H'); break;
            case 'خ': latin.Append('x'); break;
            case 'د': latin.Append('d'); break;
            case 'ذ': latin.Append('*'); break;
            case 'ر': latin.Append('r'); break;
            case 'ز': latin.Append('z'); break;
            case 'س': latin.Append('s'); break;
            case 'ش': latin.Append('$'); break;
            case 'ص': latin.Append('S'); break;
            case 'ض': latin.Append('D'); break;
            case 'ط': latin.Append('T'); break;
            case 'ظ': latin.Append('Z'); break;
            case 'ع': latin.Append('E'); break;
            case 'غ': latin.Append('g'); break;
            case 'ـ': latin.Append('_'); break;
            case 'ف': latin.Append('f'); break;
            case 'ق': latin.Append('q'); break;
            case 'ك': latin.Append('k'); break;
            case 'ل': latin.Append('l'); break;
            case 'م': latin.Append('m'); break;
            case 'ن': latin.Append('n'); break;
            case 'ه': latin.Append('h'); break;
            case 'و': latin.Append('w'); break;
            case 'ى': latin.Append('Y'); break;
            case 'ي': latin.Append('y'); break;
            case 'ً': latin.Append('F'); break;
            case 'ٌ': latin.Append('N'); break;
            case 'ٍ': latin.Append('K'); break;
            case 'َ': latin.Append('a'); break;
            case 'ُ': latin.Append('u'); break;
            case 'ِ': latin.Append('i'); break;
            case 'ّ': latin.Append('~'); break;
            case 'ْ': latin.Append('o'); break;
            #endregion
        }
    }

    return latin.ToString();
}