-4

NOT x NOT y NOT z我也尝试以这些方式回答一些简单的布尔查询,x AND y AND z并且像这样x OR y OR z x,y,z的一些单词,它们中的任何一个属于不同的file.txt或者它们都属于相同的file.txt,无论如何。

我写了一个class TermDocMatrix

它必须能够回答布尔查询,我为此准备了一些方法,class TermDocMatrix{ }但它不起作用。我什至一步一步调试代码,我意识到循环没有转。我不知道为什么,代码似乎很好。

那么你可以在这里看到代码:

class TermDocMatrix
{
    //stores distinct terms
    public HashSet<string> distinctTerm = new HashSet<string>();
    //stores document id and its contents without splitting
    public Dictionary<int, string> documentContentList = new Dictionary<int, string>();
    //stores document and its terms collection
    public Dictionary<string, List<string>> documentCollection = new Dictionary<string, List<string>>();
    public Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>();

     //stop words collection
    public List<string> stopWords = new List<string> { "on", "of", "The", "an", "a", "in" };
    //boolean operators list
    public string[] booleanOperator = new string[] { "AND", "OR", "NOT" };

    private string _FileName = "words";
    public string _Path = "";
    int _lastDocNum = 0;

    public TermDocMatrix(string IndexPath,string FileName)
    {

        if (_Path.EndsWith("\\") == false) _Path += "\\";
        if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath);
        LogManager.Configure(_Path + _FileName + ".txt", false);
        // read all files
        LoadFiles();
    }

    private void LoadFiles()
    {
        int count = 0;

        if (File.Exists(_Path + _FileName + ".txt") == false)
            return;
        // load words
        string b = File.ReadAllText(_Path + _FileName + ".txt");
        String[] TermsCollection = RemoveStopsWords(b.ToUpper().Split(' '));
        foreach (string term in TermsCollection)
        {
            //prepeare distinct terms collection
            //remove stop words
            if (!stopWords.Contains(term))
            {
                distinctTerm.Add(term);
            }
        }
        //add document and their terms collection
        documentCollection.Add(_FileName, TermsCollection.ToList());
        //add document and its content for displaying the search result
        documentContentList.Add(count, b);
        count++;
    }
    public string ProcessFiles(string query)
    {
        termDocumentIncidenceMatrix = GetTermDocumentIncidenceMatrix(distinctTerm, documentCollection);
        do
        {
            List<int> lst = ProcessQuery(query);
            int count = 0;
            if (lst != null)
            {
                foreach (int a in lst)
                {
                    if (a == 1)
                    {
                         return documentContentList[count];
                    }
                    count++;
                }
            }
            else
            {
                return "No search result found";
            }

        } while (1 == 1);
    }
    public int WordCount()
    {
        return documentCollection.Count;
    }

    public int DocumentCount
    {
        get
        {
            return _lastDocNum;
        }
    }

    private void FilterQueryTerm(ref string[] str)
    {
        List<string> _queryTerm = new List<string>();


        foreach (string queryTerm in str)
        {
            if (queryTerm.ToUpper().Equals("BUT") || termDocumentIncidenceMatrix.ContainsKey(queryTerm.ToUpper()) || booleanOperator.Contains(queryTerm))
            {
                _queryTerm.Add(queryTerm);

            }
        }

        str = _queryTerm.ToArray();
    }

    //prepares Term Document Incidence Matrix
    public Dictionary<string, List<int>> GetTermDocumentIncidenceMatrix(HashSet<string> distinctTerms, Dictionary<string, List<string>> documentCollection)
    {
        Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>();
        List<int> incidenceVector = new List<int>();
        foreach (string term in distinctTerms)
        {
            //incidence vector for each terms
            incidenceVector = new List<int>();
            foreach (KeyValuePair<string, List<string>> p in documentCollection)
            {

                if (p.Value.Contains(term))
                {
                    //document contains the term
                    incidenceVector.Add(1);

                }
                else
                {
                    //document do not contains the term
                    incidenceVector.Add(0);
                }
            }
            termDocumentIncidenceMatrix.Add(term, incidenceVector);

        }
        return termDocumentIncidenceMatrix;
    }
    //removes all stop words
    public string[] RemoveStopsWords(string[] str)
    {
        List<string> terms = new List<string>();
        foreach (string term in str)
        {
            if (!stopWords.Contains(term))
            {
                terms.Add(term);
            }
        }
        return terms.ToArray();
    }
    //process the boolean query
    public List<int> ProcessQuery(string query)
    {

        //query boolean operator
        string bitWiseOp = string.Empty;
        string[] queryTerm = RemoveStopsWords(query.ToUpper().Split(' '));

        //remove query term that doesnot appears on document collection
        FilterQueryTerm(ref queryTerm);
        List<int> previousTermIncidenceV = null;
        List<int> nextTermsIncidenceV = null;
        //holds the bitwise operation result
        List<int> resultSet = null;
        //suppose on query X AND Y, X is previousTerm term and Y is nextTerm
        Boolean hasPreviousTerm = false;
        Boolean hasNotOperation = false;
        foreach (string term in queryTerm)
        {
            //is a term
            if (!booleanOperator.Contains(term) && !term.Equals("BUT"))
            {
                //query case: structure AND NOT analysis
                if (hasNotOperation)
                {

                    if (hasPreviousTerm)
                    {
                        nextTermsIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV);
                    }
                    //query case: eg.NOT analysis
                    else
                    {
                        previousTermIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV);
                        resultSet = previousTermIncidenceV;
                    }
                    hasNotOperation = false;
                }
                else if (!hasPreviousTerm)
                {
                    previousTermIncidenceV = GetTermIncidenceVector(term);
                    resultSet = previousTermIncidenceV;
                    hasPreviousTerm = true;
                }
                else
                {

                    nextTermsIncidenceV = GetTermIncidenceVector(term);
                }
            }
            else if (term.Equals("NOT"))
            {
                //indicates that the  term in the next iteration should be complemented.
                hasNotOperation = true;
            }
            else
            {
                //'BUT' also should be evaluated as AND eg. structure BUT NOT semantic should be evaluated as structure AND NOT semantic
                if (term.Equals("BUT"))
                {
                    bitWiseOp = "AND";
                }
                else
                    bitWiseOp = term;
            }

            if (nextTermsIncidenceV != null && !hasNotOperation)
            {
                resultSet = ProcessBooleanOperator(bitWiseOp, previousTermIncidenceV, nextTermsIncidenceV);
                previousTermIncidenceV = resultSet;
                hasPreviousTerm = true;
                nextTermsIncidenceV = null;
            }
        }

        return resultSet;
    }

    //Process Boolean operators
    public List<int> ProcessBooleanOperator(string op, List<int> previousTermV, List<int> nextTermV)
    {
        List<int> resultSet = new List<int>();
        if (op.Equals("NOT"))
        {
            foreach (int a in previousTermV)
            {
                if (a == 1)
                {
                    resultSet.Add(0);
                }
                else
                {
                    resultSet.Add(1);
                }
            }
        }
        else if (op.ToUpper().Equals("AND")) //bitwise AND operation
        {
            for (int a = 0; a < previousTermV.Count; a++)
            {
                if (previousTermV[a] == 1 && nextTermV[a] == 1)
                {
                    resultSet.Add(1);
                }
                else
                {
                    resultSet.Add(0);
                }
            }
        }
        else if (op.ToUpper().Equals("OR")) //bitwise OR operation
        {
            for (int a = 0; a < previousTermV.Count; a++)
            {
                if (previousTermV[a] == 0 && nextTermV[a] == 0)
                {
                    resultSet.Add(0);
                }
                else
                {
                    resultSet.Add(1);
                }
            }
        }
        return resultSet;
    }

    //returns term incidence vector
    public List<int> GetTermIncidenceVector(string term)
    {
        return termDocumentIncidenceMatrix[term.ToUpper()];

    }
}

您现在需要了解另一个名为Class LogManagerI used it on my Class TermDocMatrix. 这里 :

namespace WindowsFormsApplication1
 {
   internal class FileLogger
   {
    public static readonly FileLogger Instance = new FileLogger();

    private string _filename;
    private bool _showMethodName = false;
    private string _FilePath = "";

    public bool ShowMethodNames
    {
        get { return _showMethodName; }
    }

    public void Init(string filename, bool showmethodnames)
    {
        _showMethodName = showmethodnames;
        _filename = filename;
        // handle folder names as well -> create dir etc.
        _FilePath = Path.GetDirectoryName(filename);
        if (_FilePath != "")
        {
            _FilePath = Directory.CreateDirectory(_FilePath).FullName;
            if (_FilePath.EndsWith("\\") == false)
                _FilePath += "\\";
        }
    }
}
internal static class LogManager
{

    public static void Configure(string filename, bool showmethodnames)
    {
        FileLogger.Instance.Init(filename, showmethodnames);
    }
}

}

它必须工作,但它没有。请告诉我为什么它不起作用。当我要求回答时,我只会看到这个“未找到搜索结果”,无论我输入了什么样的布尔查询。

4

1 回答 1

1

您的问题出在这一行:(ProcessFiles函数)

String[] termsCollection = RemoveStopsWords(file.ToUpper().Split(' '));

您正在拆分文件的名称而不是其内容 这就是您没有搜索结果的原因

你应该这样做:

String[] termsCollection = RemoveStopsWords(File.ReadAllText(file).ToUpper().Split(' '));

现在更改您的TermDocMatrix构造函数:

public TermDocMatrix(string IndexPath,string FileName)
{
    if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath);
    LogManager.Configure(System.IO.Path.Combine(_Path, _FileName + ".txt"), false);
    // read all files
    LoadFiles();
}

你的LoadFiles功能:

private void LoadFiles()
{
    int count = 0;

    if (File.Exists(System.IO.Path.Combine(_Path, _FileName + ".txt")) == false)
        return;
    // load words
    string b = File.ReadAllText(System.IO.Path.Combine(_Path, _FileName + ".txt"));

    .....
}
于 2013-05-09T18:45:57.143 回答