vb.net - 卢森 | 文档中短语的出现次数

Question

我需要在仅使用 Lucene 的文档中找到像“1 年”这样的短语的出现次数。

我知道我们可以使用 TermDocs.seek(Term) 和 TermDocs.freq() 找到任何“1”或“年”的出现。

有什么办法可以发现没有出现这样的短语吗？

score 1 · Accepted Answer

一点研究，哇哦，我明白了......

首先创建了一个这样的索引阅读器对象......

Dim indexReader As Lucene.Net.Index.IndexReader
indexReader = New Lucene.Net.Index.IndexReader.Open(INDEX_DIRECTORY, True)

然后为您的每个术语创建跨度附近查询...

Dim spanQuery1 As SpanTermQuery = New SpanTermQuery(New Term(FIELD, "2"))
Dim spanQuery2 As SpanTermQuery = New SpanTermQuery(New Term(FIELD, "year"))

Dim near As SpanNearQuery = New SpanNearQuery(New SpanQuery() 
                                             {spanQuery1, spanQuery2},
                                              0, 
                                              False)

创建一个 Span 对象来存储所有匹配的 span ....

        Dim spans As Spans = near.GetSpans(indexReader)

遍历每个跨度以获取跨度出现的数量..

        Dim num As Integer = 0

        While (spans.Next)
            num += 1
        End While

现在 num 包含短语“1 年”的出现次数。

目前它显示所有文档都没有出现。您可以跳到任何文档使用

spans.SkipTo(i)

并且可以通过以下方式查找短语是否在当前文档中

spans.doc()

我从这个 PPT中得到了这个想法。也许这可以帮助你回答你的许多其他问题......

score 0 · Accepted Answer

我在 lucene.net 中创建了一个用于实现 shingle 过滤器功能的类文件。它就像java中的那个一样工作。它在 C# 中，但我想即使你转换它也能工作。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
namespace Lucene.Net.Analysis.Shingle
{

 /**
 * A ShingleFilter constructs shingles (token n-grams) from a token stream.
 * In other words, it creates combinations of tokens as a single token.
 *
 * For example, the sentence "please divide this sentence into shingles"
 * might be tokenized into shingles "please divide", "divide this",
 * "this sentence", "sentence into", and "into shingles".
 *
 * This filter handles position increments > 1 by inserting filler tokens
 * (tokens with termtext "_"). It does not handle a position increment of 0.
 */

public sealed class ShingleFilter : TokenFilter
{

    private LinkedList<State> shingleBuf = new LinkedList<State>();
    private StringBuilder[] shingles;
    private String tokenType = "shingle";

    /**
     * filler token for when positionIncrement is more than 1
     */
    public static readonly char[] FILLER_TOKEN = { '_' };


    /**
     * default maximum shingle size is 2.
     */
    public const int DEFAULT_MAX_SHINGLE_SIZE = 2;

    /**
     * The string to use when joining adjacent tokens to form a shingle
     */
    public const String TOKEN_SEPARATOR = " ";

    /**
     * By default, we output unigrams (individual tokens) as well as shingles
     * (token n-grams).
     */
    private bool outputUnigrams = true;

    /**
     * maximum shingle size (number of tokens)
     */
    private int maxShingleSize;

    /**
     * Constructs a ShingleFilter with the specified single size from the
     * {@link TokenStream} <code>input</code>
     *
     * @param input input stream
     * @param maxShingleSize maximum shingle size produced by the filter.
     */
    public ShingleFilter(TokenStream input, int maxShingleSize)
        : base(input)
    {
        SetMaxShingleSize(maxShingleSize);
        this.termAtt = AddAttribute<ITermAttribute>(); ;
        this.offsetAtt = AddAttribute<IOffsetAttribute>(); ;
        this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); ;
        this.typeAtt = AddAttribute<ITypeAttribute>(); ;
    }

    /**
     * Construct a ShingleFilter with default shingle size.
     *
     * @param input input stream
     */
    public ShingleFilter(TokenStream input)
        : this(input, DEFAULT_MAX_SHINGLE_SIZE)
    {
    }

    /**
     * Construct a ShingleFilter with the specified token type for shingle tokens.
     *
     * @param input input stream
     * @param tokenType token type for shingle tokens
     */
    public ShingleFilter(TokenStream input, String tokenType)
        : this(input, DEFAULT_MAX_SHINGLE_SIZE)
    {
        setTokenType(tokenType);
    }

    /**
     * Set the type of the shingle tokens produced by this filter.
     * (default: "shingle")
     *
     * @param tokenType token tokenType
     */
    public void setTokenType(String tokenType)
    {
        this.tokenType = tokenType;
    }

    /**
     * Shall the output stream contain the input tokens (unigrams) as well as
     * shingles? (default: true.)
     *
     * @param outputUnigrams Whether or not the output stream shall contain
     * the input tokens (unigrams)
     */
    public void SetOutputUnigrams(bool outputUnigrams)
    {
        this.outputUnigrams = outputUnigrams;
    }

    /**
     * Set the max shingle size (default: 2)
     *
     * @param maxShingleSize max size of output shingles
     */
    public void SetMaxShingleSize(int maxShingleSize)
    {
        if (maxShingleSize < 2)
        {
            throw new ArgumentException("Max shingle size must be >= 2");
        }
        shingles = new StringBuilder[maxShingleSize];
        for (int i = 0; i < shingles.Length; i++)
        {
            shingles[i] = new StringBuilder();
        }
        this.maxShingleSize = maxShingleSize;
    }

    /**
     * Clear the StringBuilders that are used for storing the output shingles.
     */
    private void ClearShingles()
    {
        for (int i = 0; i < shingles.Length; i++)
        {
            shingles[i].Length = 0;
        }
    }

    private AttributeSource.State nextToken;
    private int shingleBufferPosition;
    private int[] endOffsets;

    /* (non-Javadoc)
     * @see org.apache.lucene.analysis.TokenStream#next()
     */
    public sealed override bool IncrementToken()
    {
        while (true)
        {
            if (nextToken == null)
            {
                if (!FillShingleBuffer())
                {
                    return false;
                }
            }

            nextToken = shingleBuf.First.Value;

            if (outputUnigrams)
            {
                if (shingleBufferPosition == 0)
                {
                    RestoreState(nextToken);
                    posIncrAtt.PositionIncrement = 1;
                    shingleBufferPosition++;
                    return true;
                }
            }
            else if (shingleBufferPosition % this.maxShingleSize == 0)
            {
                shingleBufferPosition++;
            }

            if (shingleBufferPosition < shingleBuf.Count)
            {
                RestoreState(nextToken);
                typeAtt.Type = tokenType;
                offsetAtt.SetOffset(offsetAtt.StartOffset, endOffsets[shingleBufferPosition]);
                StringBuilder buf = shingles[shingleBufferPosition];
                int termLength = buf.Length;
                char[] TermBuffer = termAtt.TermBuffer();
                if (TermBuffer.Length < termLength)
                    TermBuffer = termAtt.ResizeTermBuffer(termLength);
                buf.CopyTo(0, TermBuffer, 0, termLength);
                termAtt.SetTermLength(termLength);
                if ((!outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1)
                {
                    posIncrAtt.PositionIncrement = 1;
                }
                else
                {
                    posIncrAtt.PositionIncrement = 0;
                }
                shingleBufferPosition++;
                if (shingleBufferPosition == shingleBuf.Count)
                {
                    nextToken = null;
                    shingleBufferPosition = 0;
                }
                return true;
            }
            else
            {
                nextToken = null;
                shingleBufferPosition = 0;
            }
        }
    }

    private int numFillerTokensToInsert;
    private AttributeSource.State currentToken;
    private bool hasCurrentToken;

    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;
    private IPositionIncrementAttribute posIncrAtt;
    private ITypeAttribute typeAtt;

    /**
     * Get the next token from the input stream and push it on the token buffer.
     * If we encounter a token with position increment > 1, we put filler tokens
     * on the token buffer.
     * <p/>
     * Returns null when the end of the input stream is reached.
     * @return the next token, or null if at end of input stream
     * @throws IOException if the input stream has a problem
     */
    private bool GetNextToken()
    {

        while (true)
        {
            if (numFillerTokensToInsert > 0)
            {
                if (currentToken == null)
                {
                    currentToken = CaptureState();
                }
                else
                {
                    RestoreState(currentToken);
                }
                numFillerTokensToInsert--;
                // A filler token occupies no space
                offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
                termAtt.SetTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.Length);
                return true;
            }

            if (hasCurrentToken)
            {
                if (currentToken != null)
                {
                    RestoreState(currentToken);
                    currentToken = null;
                }
                hasCurrentToken = false;
                return true;
            }

            if (!input.IncrementToken()) return false;
            hasCurrentToken = true;

            if (posIncrAtt.PositionIncrement > 1)
            {
                numFillerTokensToInsert = posIncrAtt.PositionIncrement - 1;
            }
        }
    }

    /**
     * Fill the output buffer with new shingles.
     *
     * @throws IOException if there's a problem getting the next token
     */
    private bool FillShingleBuffer()
    {
        bool addedToken = false;
        /*
         * Try to fill the shingle buffer.
         */
        do
        {
            if (GetNextToken())
            {
                shingleBuf.AddLast(CaptureState());
                if (shingleBuf.Count > maxShingleSize)
                {
                    shingleBuf.RemoveFirst();
                }
                addedToken = true;
            }
            else
            {
                break;
            }
        } while (shingleBuf.Count < maxShingleSize);

        if (shingleBuf.Count == 0)
        {
            return false;
        }

        /*
         * If no new token could be added to the shingle buffer, we have reached
         * the end of the input stream and have to discard the least recent token.
         */
        if (!addedToken)
        {
            shingleBuf.RemoveFirst();
        }

        if (shingleBuf.Count == 0)
        {
            return false;
        }

        ClearShingles();

        endOffsets = new int[shingleBuf.Count];
        // Set all offsets to 0
        endOffsets.Initialize();

        int i = 0;
        for (IEnumerator<State> it = shingleBuf.GetEnumerator(); it.MoveNext(); )
        {
            RestoreState(it.Current);
            for (int j = i; j < shingles.Length; j++)
            {
                if (shingles[j].Length != 0)
                {
                    shingles[j].Append(TOKEN_SEPARATOR);
                }
                shingles[j].Append(termAtt.TermBuffer().Take(termAtt.TermLength()).ToArray());
            }

            endOffsets[i] = offsetAtt.EndOffset;
            i++;
        }

        return true;
    }

    public override void Reset()
    {
        base.Reset();
        nextToken = null;
        shingleBufferPosition = 0;
        shingleBuf.Clear();
        numFillerTokensToInsert = 0;
        currentToken = null;
        hasCurrentToken = false;
    }
}

}

vb.net - 卢森 | 文档中短语的出现次数

2 回答 2

Related

Reference