c++ - 使用 C++ 输入的 Unicode 字符的索引和直方图

Question

计算每个符号的出现次数，以及它们在文本、单词或行中出现的位置。我有一个包含多种语言的单词列表，类似下面这样。

我正在尝试做的是计算每个字符的出现次数以及它们在文本中的位置或常见位置。如果有可能计算音节的常见数量也会有帮助。

sommige
disa
بَعْض - ba'th
mi qani - մի քանի
bəzi
batzuk
nyeykі/nyeykaya/nyeykaye/nyeykіya - нейкі/нейкая/нейкае/нейкія
kisu - কিসু
afouhe - بعض
neki
alguns
njakoj - някой
一些
algú/alguns/alguna/algunes
neki
někteří
nogle
berekhey āz - برخی از
een paar
kam - كام
some
iuj
mõned
berekhey āz - برخی از
ilan
joitakin
sommige
certains
algúns
ramdenime - რამდენიმე
einige
peripou - περίπου
keṭelāk - કેટલાક 
wasu
kèk
khemeh - כמה
kuch - कुछ
néhány
sumir
beberapa
roinnt
alcuni
ikutsu ka no - いくつかの
kelavu
មួយចំនួន
조금 - jo geum
هەندێک
aliquis
daži
keli
nekoi - некои
misy
beberapa
ഏതാനും
xi
yī xiē  - 一些
kaahi - कांही
neki
shwiya - بعض
kehi - केही
enkelte
gari
berekhey āz - برخی از
b'eda - بعضی
kilka
ਕਈ
alguns
câţiva/câteva
некоторые - nekotorыe

some
neki - неки
samahara - සමහර
niektorí
nekaj
algunos
baadhi
några
ilan
yakchand - якчанд
konjam - கொஞ்சம்
yan
konni - కొన్ని
บาง - baang
bazı
dejakі - деякі
chened - چند
ba'zi, qandaydir
một số
rhai
עטלעכע
die
okumbalwa

这是当前的代码，经 sehe 修改后它可以处理 Unicode：

//#define PREFER_BOOST
#include <algorithm>
#include <clocale>
#include <cwctype>
#include <fstream>
#include <iostream>
#include <istream>
#include <iterator>
#include <locale>
#include <map>
#include <string>
#ifdef PREFER_BOOST
#include <boost/locale.hpp>
#endif

using namespace std;

// Histogram filled by Counter: lower-cased character -> number of occurrences.
std::map<wchar_t, int> letterCount;

// Functor for std::for_each that tallies wide characters in letterCount.
struct Counter
{
    // Counts `item` under its lower-case form; whitespace characters are ignored.
    void operator()(wchar_t  item) 
    { 
        // The <cctype> functions std::isspace/std::tolower take an int that must be
        // representable as unsigned char, so passing an arbitrary wchar_t to them is
        // undefined behaviour. The <cwctype> counterparts accept any wide character
        // and classify it according to the current C locale.
        const std::wint_t ch = static_cast<std::wint_t>(item);
        if ( !std::iswspace(ch) )
            ++letterCount[static_cast<wchar_t>(std::towlower(ch))]; //remove towlower if you want case-sensitive solution!
    }
};

// Reads input.txt through a UTF-8 locale, feeds every character to Counter and
// prints the resulting histogram as "<char> : <count>", one entry per line.
// Returns 1 when the input file cannot be opened, 0 otherwise.
int main()
{
    std::setlocale(LC_ALL, "en_US.UTF-8");
    std::wifstream input("input.txt");
    if (!input.is_open())
    {
        // A missing file would otherwise just produce an empty histogram.
        std::wcerr << L"cannot open input.txt" << std::endl;
        return 1;
    }

    // NOTE(review): constructing a named std::locale throws when that locale is
    // not installed on the machine; the exception is not handled here.
#ifdef PREFER_BOOST
    boost::locale::generator gen;
    std::locale loc = gen("en_US.UTF-8");
#else
    std::locale loc("en_US.UTF-8");
#endif
    input.imbue(loc);
    std::wcout.imbue(loc);

    // istreambuf_iterator reads unformatted, so whitespace reaches Counter too
    // and is filtered out there.
    std::istreambuf_iterator<wchar_t> start(input), end;
    std::for_each(start, end, Counter());

    for (std::map<wchar_t, int>::const_iterator it = letterCount.begin(); it != letterCount.end(); ++it)
    {
        // L'\n' instead of endl: no need to flush after every entry.
        std::wcout << it->first << " : " << it->second << L'\n';
    }
    return 0;
}

这是我的原始代码

 #include <iostream>
  #include <cctype>
 #include <fstream>
#include <string>
 #include <map>
  #include <istream>
   #include <vector>
 #include <list>
 #include <algorithm>
#include <iterator>


using namespace std;
 // ctype facet whose table classifies the ASCII letters A-Z and a-z as
 // alphabetic and every other character as whitespace. Imbued on an input
 // stream, it makes formatted extraction treat every non-letter as a separator.
 struct letter_only: std::ctype<char> 
 {
    letter_only(): std::ctype<char>(get_table()) {}

    // Returns a pointer to the classification table (table_size entries) that
    // is handed to the std::ctype<char> base class.
    static std::ctype_base::mask const* get_table()
    {
       static std::vector<std::ctype_base::mask> 
             rc(std::ctype<char>::table_size,std::ctype_base::space);

       // Two separate ranges: the characters between 'Z' and 'a'
       // ([ \ ] ^ _ `) are not letters and must stay classified as space.
       std::fill(&rc['A'], &rc['Z'+1], std::ctype_base::alpha);
       std::fill(&rc['a'], &rc['z'+1], std::ctype_base::alpha);
       return &rc[0];
    }
 };

// Functor for std::for_each that builds a case-insensitive histogram of the
// characters it is called with.
struct Counter
{
    // lower-cased character -> number of occurrences
    std::map<char, int> letterCount;

    // Counts `item` under its lower-case form; whitespace characters are ignored.
    void operator()(char  item) 
    { 
       // The <cctype> functions require a value representable as unsigned char.
       const unsigned char ch = static_cast<unsigned char>(item);
       // std::ctype_base::space is a classification mask, not a character value,
       // so whitespace has to be detected with std::isspace.
       if ( !std::isspace(ch) )
         ++letterCount[static_cast<char>(std::tolower(ch))]; //remove tolower if you want case-sensitive solution!
    }

    // Lets the functor returned by std::for_each initialise a std::map directly.
    operator std::map<char, int>() const { return letterCount ; }
};

// Reads the file "T" through the letter_only facet, counts the extracted
// characters with Counter and prints "<char> : <count>", one entry per line.
// Returns 1 when the file cannot be opened, 0 otherwise.
int main()
{
     std::ifstream input;
     input.imbue(std::locale(std::locale(), new letter_only())); //enable reading letters only!
     input.open("T");
     if (!input.is_open())
     {
          // A missing file would otherwise just produce an empty histogram.
          std::cerr << "cannot open input file: T" << std::endl;
          return 1;
     }

     // istream_iterator uses formatted extraction, which skips whatever the
     // imbued facet classifies as whitespace.
     std::istream_iterator<char> start(input);
     std::istream_iterator<char> end;
     // for_each returns the functor; Counter converts to its histogram map.
     std::map<char, int> letterCount = std::for_each(start, end, Counter());
     for (std::map<char, int>::const_iterator it = letterCount.begin(); it != letterCount.end(); ++it)
     {
          // '\n' instead of endl: no need to flush after every entry.
          std::cout << it->first <<" : "<< it->second << '\n';
     }
     return 0;
}

我试图得到的例子

к : 10 (2,5) (1,5,8) (2,7) (1,3,5)

也就是说：先找到字母 к，再统计它的出现次数（10），然后列出它在每个单词中的位置，如前所述。

score 2 · Accepted Answer

这就是我得到的，它似乎在我的机器¹上运行良好。

//#define PREFER_BOOST
#include <algorithm>
#include <clocale>
#include <cwctype>
#include <fstream>
#include <iostream>
#include <istream>
#include <iterator>
#include <locale>
#include <map>
#include <string>
#ifdef PREFER_BOOST
#include <boost/locale.hpp>
#endif

using namespace std;

// Histogram filled by Counter: lower-cased character -> number of occurrences.
std::map<wchar_t, int> letterCount;

// Functor for std::for_each that tallies wide characters in letterCount.
struct Counter
{
    // Counts `item` under its lower-case form; whitespace characters are ignored.
    void operator()(wchar_t  item) 
    { 
        // The <cctype> functions std::isspace/std::tolower take an int that must be
        // representable as unsigned char, so passing an arbitrary wchar_t to them is
        // undefined behaviour. The <cwctype> counterparts accept any wide character
        // and classify it according to the current C locale.
        const std::wint_t ch = static_cast<std::wint_t>(item);
        if ( !std::iswspace(ch) )
            ++letterCount[static_cast<wchar_t>(std::towlower(ch))]; //remove towlower if you want case-sensitive solution!
    }
};

// Reads input.txt through a UTF-8 locale, feeds every character to Counter and
// prints the resulting histogram as "<char> : <count>", one entry per line.
// Returns 1 when the input file cannot be opened, 0 otherwise.
int main()
{
    std::setlocale(LC_ALL, "en_US.UTF-8");
    std::wifstream input("input.txt");
    if (!input.is_open())
    {
        // A missing file would otherwise just produce an empty histogram.
        std::wcerr << L"cannot open input.txt" << std::endl;
        return 1;
    }

    // NOTE(review): constructing a named std::locale throws when that locale is
    // not installed on the machine; the exception is not handled here.
#ifdef PREFER_BOOST
    boost::locale::generator gen;
    std::locale loc = gen("en_US.UTF-8");
#else
    std::locale loc("en_US.UTF-8");
#endif
    input.imbue(loc);
    std::wcout.imbue(loc);

    // istreambuf_iterator reads unformatted, so whitespace reaches Counter too
    // and is filtered out there.
    std::istreambuf_iterator<wchar_t> start(input), end;
    std::for_each(start, end, Counter());

    for (std::map<wchar_t, int>::const_iterator it = letterCount.begin(); it != letterCount.end(); ++it)
    {
        // L'\n' instead of endl: no need to flush after every entry.
        std::wcout << it->first << " : " << it->second << L'\n';
    }
    return 0;
}

注：如果您更喜欢 boost locale 库，则需要链接 boost_system、boost_locale 和 boost_thread；我没有发现两者在行为上有任何明显差异。

输出：

' : 3 , : 1 - : 32 / : 10 a : 67 b : 16 c : 7
d : 12 e : 61 f : 1 g : 16 h : 17 i : 46 j : 8
k : 41 l : 19 m : 19 n : 47 o : 20 p : 5 q : 3
r : 18 s : 21 t : 12 u : 21 v : 3 w : 3 x : 2
y : 21 z : 7 á : 1 â : 2 å : 1 è : 1 é : 1
í : 2 õ : 1 ú : 2 ā : 4 ē : 1 ě : 1 ī : 1
ı : 1 ř : 1 ţ : 1 ž : 1 ə : 1 ί : 1 ε : 1
ο : 1 π : 2 ρ : 1 υ : 1 а : 3 д : 2 е : 10
и : 2 й : 5 к : 10 н : 9 о : 4 р : 1 т : 1
ч : 1 ы : 2 я : 5 і : 6 ա : 1 ի : 2 մ : 1
ն : 1 ք : 1 ה : 1 ט : 1 כ : 2 ל : 1 מ : 1
ע : 3 ا : 4 ب : 7 خ : 3 د : 2 ر : 3 ز : 3
ض : 4 ع : 4 ك : 1 م : 1 ن : 2 ه : 1 َ : 1
 : 1 چ : 1 ک : 1 ی : 4 ێ : 1 ە : 1 ं : 1
ी : 2 ु : 1 े : 1 ক : 1 ি : 1 ু : 1 ਕ : 1
ક : 2 ટ : 1 ે : 1 க : 1 ் : 2 క : 1 ి : 1
 : 1 ഏ : 1 ു : 1 ර : 1 ස : 1 ง : 1 า : 1
ა : 1 დ : 1 ე : 2 ი : 1 მ : 2 ნ : 1 რ : 1
ច : 1 ន : 2 ម : 1 យ : 1 ួ : 2 ំ : 1 ố : 1
ộ : 1 い : 1 か : 1 く : 1 の : 1 一 : 2 些 : 2
금 : 1

¹ 我的终端可能无法显示所有字符，但这很可能是终端字体造成的。

score 2 · Accepted Answer

这是我要做的，在所有其他人的帮助下，谢谢。

#include <algorithm>
#include <clocale>
#include <cwctype>
#include <fstream>
#include <iostream>
#include <istream>
#include <iterator>
#include <locale>
#include <map>
#include <sstream>
#include <string>
using namespace std;

// Per-character statistics stored in the letterCount map.
struct op {
    int O;           // number of occurrences of the character
    std::wstring P;  // accumulated position text, one "(i,j,...)" group per line

    // Start from a zero count so that a default-initialised op never holds an
    // indeterminate value.
    op() : O(0) {}
};

// Formats Number with operator<< and appends a trailing comma,
// e.g. 7 becomes L"7,".
template <typename T>
std::wstring NumberToString ( T Number )
{
    std::wostringstream formatted;
    formatted << Number << L',';
    return formatted.str();
}

// Per-character statistics keyed by the lower-cased character.
std::map<wchar_t, struct op> letterCount;

// Adds one occurrence of `item` to letterCount under its lower-case form;
// whitespace characters are ignored.
void Counter(wchar_t  item) {
    // The <cctype> functions std::isspace/std::tolower are undefined for values
    // that do not fit in unsigned char; the <cwctype> counterparts accept any
    // wide character and classify it according to the current C locale.
    const std::wint_t ch = static_cast<std::wint_t>(item);
    if ( !std::iswspace(ch) ) {
        ++letterCount[static_cast<wchar_t>(std::towlower(ch))].O;    //remove towlower if you want case-sensitive solution!
    }
}

// Reads the file "T" line by line through a UTF-8 locale, echoes each line,
// counts every non-whitespace character and records, per line, the positions
// at which each distinct character occurs as a "(i,j,...)" group. Finally
// prints "<char> : <count>@<position groups>" for every character seen.
// Returns 1 when the file cannot be opened, 0 otherwise.
int main()
{
    std::setlocale(LC_ALL, "en_US.UTF-8");
    std::wifstream input("T");
    std::locale loc("en_US.UTF-8");
    input.imbue(loc);
    std::wcout.imbue(loc);

    if (!input.is_open()) {
        std::wcerr << L"cannot open input file: T" << std::endl;
        return 1;
    }

    std::wstring line;
    // Testing getline itself (instead of eof()) avoids processing a spurious
    // extra line after the last successful read.
    while (std::getline(input, line)) {
        std::wcout << line << std::endl;

        // Characters of this line whose position group was already written.
        std::wstring seen;

        // Index the string directly: no manually allocated copy to leak, and
        // the loop stops before the terminating NUL so it is never counted.
        for (std::wstring::size_type i = 0; i < line.size(); ++i) {
            const wchar_t current = line[i];
            Counter(current);

            // Whitespace is not counted, so it gets no position group either.
            if (std::iswspace(static_cast<std::wint_t>(current)))
                continue;
            if (seen.find(current) != std::wstring::npos)
                continue;
            seen += current;

            std::wstring &positions =
                letterCount[static_cast<wchar_t>(std::towlower(static_cast<std::wint_t>(current)))].P;
            positions += L"(";
            // `current` was not seen before index i, so the scan can start there.
            for (std::wstring::size_type j = i; j < line.size(); ++j) {
                if (line[j] == current)
                    positions += NumberToString(j);
            }
            positions += L")";
        }
    }
    input.close();

    for (std::map<wchar_t, struct op>::const_iterator it = letterCount.begin(); it != letterCount.end(); ++it) {
        std::wcout << it->first << " : " << it->second.O << "@" << it->second.P << L'\n';
    }
    return 0;
}

输出：

н : 9@(36,42,49,56,)(9,)(8,)(0,)(7,)(15,)
о : 4@(12,)(11,)(3,5,)
р : 1@(6,)
т : 1@(4,)
ч : 1@(13,)
ы : 2@(7,19,)
я : 5@(47,61,)(10,)(11,)(11,)
і : 6@(5,30,40,60,)(5,13,)
ա : 1@(14,)
ի : 2@(11,16,)
մ : 1@(10,)
ն : 1@(15,)
ք : 1@(13,)

score 1 · Accepted Answer

#include <iostream>
#include <stdio>
#include <conio>

// Reads a character count and then that many characters from standard input,
// and prints, for every distinct character in first-seen order, the list of
// positions at which it occurs followed by the character and its total count.
// Returns 1 when the requested length is unusable, 0 otherwise.
// NOTE(review): clrscr()/getch() come from the non-standard <conio> header and
// cout/cin are used unqualified, as in the rest of this snippet.
int main()
{
 const int capacity=1000;          // size of the fixed buffers below
 char name[capacity];              // the text as typed
 char temp[capacity];              // distinct characters of name[], in first-seen order
 int i,j,n,present,count=0;        // count = number of distinct characters in temp[]
 int ch;
 clrscr();
 cout<<"Enter your char length:-";
 cin>>n;

 // A failed read or a length outside the buffers would overrun name[]/temp[].
 if(!cin || n<1 || n>capacity)
 {
  cout<<"\nLength must be between 1 and "<<capacity<<"\n";
  getch();
  return 1;
 }

 // Drop the rest of the line that held the number, so the line break typed
 // after it is not stored as the first character of the text.
 while((ch=getchar())!='\n' && ch!=EOF)
 {
 }

 cout<<"\nEnter your text below:-\n\n";

 //get the text (stop early if the input ends before n characters arrive)
 for(i=0;i<n;i++)
 {
  ch=getchar();
  if(ch==EOF)
   break;
  name[i]=static_cast<char>(ch);
 }
 n=i;

 //extracting unique characters to temp[]
 //(count tracks how many are stored, so temp[] needs no terminator and no strlen)
 for(i=0;i<n;i++)
 {
    present=0;
    for(j=0;j<count;j++)
    {
      if(name[i]==temp[j])
      {
        present=1;
        break;
      }
    }
    if(present==0)
    {
     temp[count]=name[i];
     count++;
    }
 }

 //counting char occurrence
 for(i=0;i<count;i++)
 {
    int occurrences=0;
    cout<<"\n(";
    for(j=0;j<n;j++)
    {
      if(temp[i]==name[j])
      {
        occurrences++;
        cout<<j<<",";
      }
    }
    cout<<")\t\t"<<temp[i]<<":"<<occurrences;
 }
 getch();
 return 0;
}

score 0 · Accepted Answer

您应该研究霍夫曼编码：NIST on Huffman Coding

这样，您不仅会得到所有出现的字母，还会得到常见的音节数（如果我理解正确的话）。

霍夫曼算法通常用于压缩和搜索树，但它会解决您的驱动问题（因为这正是霍夫曼所做的）。

Codeproject 上已经有一个可用的 C++ 实现：Huffman in C++ 你只需要它的一部分，因为你可能对压缩不感兴趣。

c++ - 使用 C++ 输入的 Unicode 字符的索引和直方图

4 回答 4

Related

Reference