-1

给定一个大文件,我们需要存储单词,以便可以在恒定时间内完成单词的搜索。另外,我们如何找到文件中出现频率最高的 10% 的单词?

到目前为止,我所取得的成就是通过 trie 实现搜索单词。请提出一些方法来找到 10% 最常用的单词。

  #include<iostream>
#include<cstdio>
using namespace std;
class Node
{
    public:
    char value;
    Node* right;
    Node* down;
    Node()
    {
        right=down=NULL;
    }
};
class Trie
{
    public:
    Node* head;
    Trie()
    {
        head=NULL;
    }
    void insert(string s);
    void search(string s);
};
void Trie::insert(string s)
{
    if(head==NULL)
    {
        Node* f=new Node();
        head=f;
        Node* temp=f;
        f->value=s[0];
        for(int i=1;i<s.length();i++)
        {
            Node* n=new Node();
            n->value=s[i];
            temp->down=n;
            temp=n;
            if(i==s.length()-1)
            n->down=NULL;
        }
    }
    else
    {
        Node* ptr=head;
        int i=0;
        while(1)
        {
            if(i==s.length())break;
            if(ptr->value==s[i])
            {
                i++;
                if(ptr->down)
                ptr=ptr->down;
                else
                {
                    Node* temp=new Node();
                    ptr->down=temp;
                    temp->value=s[i];
                    ptr=temp;
                }
            }
            else if(ptr->value!=s[i])
            {
                if(ptr->right)
                ptr=ptr->right;
                else
                {
                    Node*temp=new Node();
                    ptr->right=temp;
                    temp->value=s[i];
                    ptr=temp;
                }
            }
        }       
    }
}
void Trie::search(string s)
{
    Node* ptr=head;
    int i=0;
    while(1)
    {
        if(ptr->value==s[i])
        {
            //cout<<ptr->value<<endl;
            ptr=ptr->down;
            i++;
        }
        else if(ptr->value!=s[i])
        {
            ptr=ptr->right;
        }
        if(ptr==NULL)break;
    }

    if(i==s.length()+1)cout<<"String found\n";
    else cout<<"String not found\n";
}
int main()
{
    Trie t;
    FILE* input;
    char s[100];
    input=fopen("big.txt","r");
    int i=0;
    while(  (fgets(s,sizeof(s),input) ) !=NULL)
    {
        int i=0; int j=0;
        char str[47];
        while(s[i]!='\0')
        {
            if(s[i]==' ' || s[i+1]=='\0')
            {
                str[j]='\0';
                j=0;
                t.insert(str);
                i++;
                continue;
            }
            str[j]=s[i];
            j++;
            i++;
        }
    }


    t.search("Dates");
    //t.search("multinational");
    fclose(input);
}
4

4 回答 4

0

如果您使用一棵树,您将无法获得恒定的时间。您正在构建的二叉树具有对数时间复杂度。

如果可以建立索引,请考虑倒排索引。这仍然无法帮助您获得恒定的时间(无论如何我不知道您如何实现),但可以帮助您确定使用最多的单词是什么,因为对于每个单词,它都将位置存储在文件中词被发现。您实际上可以将其组合到您的树中。

于 2013-09-18T14:26:49.433 回答
0

哈希可以让您在恒定时间内查找单词。

您可能可以使用快速排序中使用的某种分区来查找文件中至少出现 10% 的单词。

于 2013-09-18T14:13:33.643 回答
0

显而易见的解决方案是将文件的内容存储在一些适当的 STL 容器中,例如std::set然后find()在该容器上运行。

如果您坚持手动执行此操作,则二叉树将随着您放入其中的数据越多而变得越来越慢。另外,您必须保持平衡。对于大量数据,带有链接的哈希表将是更有效的 ADT。

于 2013-09-18T14:14:18.257 回答
0

这是使用优先级队列、map 和 trie 的类似 c++ 代码。为简单起见,它从向量字符串中读取,但可以很容易地修改为从文件中读取单词。

//查找文件或流中的前K个频繁词,C++

//这是priority_queue的工作解决方案供您参考。

    #include <iostream>
    #include <vector>
    #include <queue>
    #include <unordered_map>
    using namespace std;

    #define K_TH 3


    class TrieNode;
    typedef struct HeapNode
    {
        string word;
        int frequency;
        HeapNode(): frequency(0), word(""){} ;
        TrieNode *trieNode;

    }HeapNode;


    class TrieNode
    {
        private:
            int frequency = 0;
            bool m_isLeaf = false;
            string word = "";
            unordered_map<char, TrieNode*> children;
            HeapNode *heapNode = NULL;

        public:
            TrieNode() {}
            TrieNode(char c)
            {
                children[c] = new TrieNode();
                this->m_isLeaf = false;
            }

            void setWord(string word)
            {
                this->word = word;
            }
            string getWord()
            {
                return this->word;
            }
            bool isLeaf(void)
            {
                return this->m_isLeaf;
            }
            void setLeaf(bool leaf)
            {
                this->m_isLeaf = leaf;
            }
            TrieNode* getChild(char c)
            {
                if (children[c] != NULL)
                    return children[c];
                return NULL;
            }
            void insert(char c)
            {
                children[c] = new TrieNode();
            }
            int getFrequency()
            {
                return this->frequency;
            }
            void setFrequency(int frequency)
            {
                this->frequency = frequency;
            }
            void setHeapNode(HeapNode *heapNode)
            {
                this->heapNode = heapNode;
            }
            HeapNode* getHeapNode()
            {
                return heapNode;
            }
            bool operator()(HeapNode* &a, HeapNode* &b)
            {
                return (a->frequency > b->frequency);
            }
    };

    class Trie
    {
        private:
            TrieNode *root = NULL;

        public:
            Trie()
            {
                if (!root)
                {
                    this->root = new TrieNode();
                }
            }
            TrieNode* insert(string word)
            {
                if (!root)
                    root = new TrieNode();
                TrieNode* current = root;
                int length = word.length();
                //insert "abc"
                for(int i = 0; i < length; ++i)
                {
                    if (current->getChild(word.at(i)) == NULL)
                    {
                        current->insert(word.at(i));
                    }
                    current = current->getChild(word.at(i));
                }
                current->setLeaf(true);
                current->setWord(word);
                current->setFrequency(current->getFrequency() + 1);
                return current;
            }
    };



    struct cmp
    {
        bool operator()(HeapNode* &a, HeapNode* &b)
        {
            return (a->frequency > b->frequency);
        }
    };
    typedef priority_queue<HeapNode*, vector<HeapNode*>, cmp > MinHeap;


    void insertUtils(Trie *root, MinHeap &pq, string word )
    {
        if (!root)
            return;

        TrieNode* current = root->insert(word);
        HeapNode *heapNode = current->getHeapNode();
        if(heapNode)// if word already present in heap 
        {
            heapNode->frequency += 1;
        }else if (pq.empty() || pq.size() < K_TH)
        {// if word not present in heap and heap is not full;
            heapNode = new HeapNode();
            heapNode->word = word;
            heapNode->frequency = 1;
            heapNode->trieNode = current;
            current->setHeapNode(heapNode);
            pq.push(heapNode);
        }else if (pq.top()->frequency < current->getFrequency())
        {   // if word is not present and heap is full;
            HeapNode *temp = pq.top();
            //remove first element and add current word
            pq.pop();
            delete temp;
            heapNode = new HeapNode();
            current->setHeapNode(heapNode);
            pq.push(heapNode);
        }
    }


    void printKMostFrequentWords(vector<std::string> input)
    {

        Trie *root = new Trie();
        MinHeap minHeap;
        for (vector<string>::iterator it = input.begin(); it != input.end(); ++it)
        {
            insertUtils(root, minHeap, *it);
        }

        while(!minHeap.empty())
        {
            HeapNode *heapNode = minHeap.top();
            cout << heapNode->word << ":" << heapNode->frequency << endl;
            minHeap.pop();
        }


    }

    int main() {

    vector<std::string>input( {
        "abc", "def", "ghi",
        "jkl", "abc", "def",
        "mno", "xyz", "abc"

    } ) ;
    printKMostFrequentWords(input);
    }
于 2018-12-15T22:38:39.797 回答