我需要在仅使用 Lucene 的文档中找到像“1 年”这样的短语的出现次数。
我知道我们可以使用 TermDocs.seek(Term) 和 TermDocs.freq() 找到任何“1”或“年”的出现。
有什么办法可以找出这类短语的出现次数吗?
我需要在仅使用 Lucene 的文档中找到像“1 年”这样的短语的出现次数。
我知道我们可以使用 TermDocs.seek(Term) 和 TermDocs.freq() 找到任何“1”或“年”的出现。
有什么办法可以找出这类短语的出现次数吗?
一点研究,哇哦,我明白了......
首先创建了一个这样的索引阅读器对象......
' IndexReader.Open is a Shared (static) factory method, so it is invoked on the
' type itself rather than through New. The second argument (True) is passed as
' the read-only flag.
Dim indexReader As Lucene.Net.Index.IndexReader = Lucene.Net.Index.IndexReader.Open(INDEX_DIRECTORY, True)
然后为短语中的每个词项创建 SpanTermQuery,并把它们组合成一个 SpanNearQuery(跨度邻近查询)……
' One SpanTermQuery per term of the phrase "1 year".
Dim spanQuery1 As SpanTermQuery = New SpanTermQuery(New Term(FIELD, "1"))
Dim spanQuery2 As SpanTermQuery = New SpanTermQuery(New Term(FIELD, "year"))
' Combine the terms with slop 0 so they must be adjacent. The array initializer
' has to start on the same line as "New SpanQuery()" for VB to parse it.
' NOTE(review): the last argument (inOrder) is False, so per the Lucene
' SpanNearQuery API "year 1" would be counted as well; pass True to count only
' the exact order "1 year".
Dim near As SpanNearQuery = New SpanNearQuery(New SpanQuery() {spanQuery1, spanQuery2},
                                              0,
                                              False)
创建一个 Spans 对象来保存所有匹配的跨度(span)……
' Ask the near query for the spans it matches, as seen through indexReader.
Dim spans As Spans = near.GetSpans(indexReader)
遍历每个跨度以获取跨度出现的数量..
' Walk the span enumeration to its end, counting one per matching span.
Dim num As Integer = 0
Do While spans.Next()
    num = num + 1
Loop
现在 num 包含短语“1 年”的出现次数。
目前它统计的是所有文档中的出现次数总和。您可以使用以下方法跳到任意文档:
' i is the target document number.
' NOTE(review): per the Lucene Spans API, SkipTo moves to the first match whose
' document number is >= the target - confirm for the Lucene.Net version in use.
spans.SkipTo(i)
然后通过以下方式取得当前匹配所在的文档编号,从而判断短语是否出现在目标文档中:
' NOTE(review): per the Lucene Spans API this returns the document number of
' the span the enumeration is currently positioned on - confirm.
spans.doc()
我从这个 PPT中得到了这个想法。也许这可以帮助你回答你的许多其他问题......
我在 Lucene.Net 中写了一个实现 shingle 过滤器功能的类文件,它的工作方式与 Java 版 Lucene 中的 ShingleFilter 相同。代码是用 C# 写的,但我想即使把它转换成其他 .NET 语言(例如 VB.NET)也同样可用。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
namespace Lucene.Net.Analysis.Shingle
{
    /// <summary>
    /// A ShingleFilter constructs shingles (token n-grams) from a token stream.
    /// In other words, it creates combinations of tokens as a single token.
    /// </summary>
    /// <remarks>
    /// <para>
    /// For example, the sentence "please divide this sentence into shingles"
    /// might be tokenized into shingles "please divide", "divide this",
    /// "this sentence", "sentence into", and "into shingles".
    /// </para>
    /// <para>
    /// This filter handles position increments greater than 1 by inserting
    /// filler tokens (tokens with term text "_"). It does not handle a position
    /// increment of 0.
    /// </para>
    /// </remarks>
    public sealed class ShingleFilter : TokenFilter
    {
        /// <summary>
        /// Filler token for when positionIncrement is more than 1.
        /// </summary>
        public static readonly char[] FILLER_TOKEN = { '_' };

        /// <summary>
        /// Default maximum shingle size is 2.
        /// </summary>
        public const int DEFAULT_MAX_SHINGLE_SIZE = 2;

        /// <summary>
        /// The string to use when joining adjacent tokens to form a shingle.
        /// </summary>
        public const String TOKEN_SEPARATOR = " ";

        // Captured states of the most recent tokens (real or filler); never
        // holds more than maxShingleSize entries.
        private readonly LinkedList<State> shingleBuf = new LinkedList<State>();

        // shingles[k] accumulates the text of the first k + 1 buffered tokens,
        // joined by TOKEN_SEPARATOR (fewer if the buffer is shorter).
        private StringBuilder[] shingles;

        // Token type assigned to every emitted shingle.
        private String tokenType = "shingle";

        // By default, we output unigrams (individual tokens) as well as shingles
        // (token n-grams).
        private bool outputUnigrams = true;

        // Maximum shingle size (number of tokens).
        private int maxShingleSize;

        // State of the first buffered token while its shingles are being
        // emitted; null means the buffer has to be refilled first.
        private AttributeSource.State nextToken;

        // Index into shingles/endOffsets of the next token to emit.
        private int shingleBufferPosition;

        // endOffsets[i] is the end offset of the i-th buffered token.
        private int[] endOffsets;

        // Number of filler tokens still to be emitted before the pending token.
        private int numFillerTokensToInsert;

        // The real input token that was set aside while fillers are emitted.
        private AttributeSource.State currentToken;

        // True when a token read from the input has not been handed out yet.
        private bool hasCurrentToken;

        private readonly ITermAttribute termAtt;
        private readonly IOffsetAttribute offsetAtt;
        private readonly IPositionIncrementAttribute posIncrAtt;
        private readonly ITypeAttribute typeAtt;

        /// <summary>
        /// Constructs a ShingleFilter with the specified shingle size from the
        /// token stream <paramref name="input"/>.
        /// </summary>
        /// <param name="input">Input stream.</param>
        /// <param name="maxShingleSize">Maximum shingle size produced by the filter.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="maxShingleSize"/> is less than 2.
        /// </exception>
        public ShingleFilter(TokenStream input, int maxShingleSize)
            : base(input)
        {
            SetMaxShingleSize(maxShingleSize);
            this.termAtt = AddAttribute<ITermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            this.typeAtt = AddAttribute<ITypeAttribute>();
        }

        /// <summary>
        /// Constructs a ShingleFilter with the default shingle size.
        /// </summary>
        /// <param name="input">Input stream.</param>
        public ShingleFilter(TokenStream input)
            : this(input, DEFAULT_MAX_SHINGLE_SIZE)
        {
        }

        /// <summary>
        /// Constructs a ShingleFilter with the default shingle size and the
        /// specified token type for shingle tokens.
        /// </summary>
        /// <param name="input">Input stream.</param>
        /// <param name="tokenType">Token type for shingle tokens.</param>
        public ShingleFilter(TokenStream input, String tokenType)
            : this(input, DEFAULT_MAX_SHINGLE_SIZE)
        {
            setTokenType(tokenType);
        }

        /// <summary>
        /// Sets the type of the shingle tokens produced by this filter
        /// (default: "shingle").
        /// </summary>
        /// <param name="tokenType">Token type for shingle tokens.</param>
        public void setTokenType(String tokenType)
        {
            this.tokenType = tokenType;
        }

        /// <summary>
        /// Sets whether the output stream contains the input tokens (unigrams)
        /// as well as shingles (default: true).
        /// </summary>
        /// <param name="outputUnigrams">
        /// Whether or not the output stream shall contain the input tokens.
        /// </param>
        public void SetOutputUnigrams(bool outputUnigrams)
        {
            this.outputUnigrams = outputUnigrams;
        }

        /// <summary>
        /// Sets the maximum shingle size (default: 2) and allocates one text
        /// buffer per possible shingle length.
        /// </summary>
        /// <param name="maxShingleSize">Maximum size of output shingles.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="maxShingleSize"/> is less than 2.
        /// </exception>
        public void SetMaxShingleSize(int maxShingleSize)
        {
            if (maxShingleSize < 2)
            {
                throw new ArgumentException("Max shingle size must be >= 2");
            }

            shingles = new StringBuilder[maxShingleSize];
            for (int i = 0; i < shingles.Length; i++)
            {
                shingles[i] = new StringBuilder();
            }

            this.maxShingleSize = maxShingleSize;
        }

        /// <summary>
        /// Clears the StringBuilders that are used for storing the output shingles.
        /// </summary>
        private void ClearShingles()
        {
            for (int i = 0; i < shingles.Length; i++)
            {
                shingles[i].Length = 0;
            }
        }

        /// <summary>
        /// Advances to the next output token: either a unigram (when unigram
        /// output is enabled) or a shingle starting at the first buffered token.
        /// </summary>
        /// <returns>
        /// true if the attributes now describe a new token; false once the
        /// shingle buffer can no longer be filled from the input.
        /// </returns>
        public sealed override bool IncrementToken()
        {
            while (true)
            {
                if (nextToken == null)
                {
                    if (!FillShingleBuffer())
                    {
                        return false;
                    }
                }

                nextToken = shingleBuf.First.Value;

                if (outputUnigrams)
                {
                    if (shingleBufferPosition == 0)
                    {
                        // Position 0 is the plain token itself.
                        RestoreState(nextToken);
                        posIncrAtt.PositionIncrement = 1;
                        shingleBufferPosition++;
                        return true;
                    }
                }
                else if (shingleBufferPosition % this.maxShingleSize == 0)
                {
                    // Unigrams are suppressed: step over the single-token slot.
                    shingleBufferPosition++;
                }

                if (shingleBufferPosition < shingleBuf.Count)
                {
                    RestoreState(nextToken);
                    typeAtt.Type = tokenType;
                    // The shingle starts where the first token starts and ends
                    // where its last constituent token ends.
                    offsetAtt.SetOffset(offsetAtt.StartOffset, endOffsets[shingleBufferPosition]);

                    StringBuilder buf = shingles[shingleBufferPosition];
                    int termLength = buf.Length;
                    char[] termBuffer = termAtt.TermBuffer();
                    if (termBuffer.Length < termLength)
                    {
                        termBuffer = termAtt.ResizeTermBuffer(termLength);
                    }
                    buf.CopyTo(0, termBuffer, 0, termLength);
                    termAtt.SetTermLength(termLength);

                    // Without unigrams the first shingle of a group gets
                    // increment 1; every other shingle gets increment 0.
                    if ((!outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1)
                    {
                        posIncrAtt.PositionIncrement = 1;
                    }
                    else
                    {
                        posIncrAtt.PositionIncrement = 0;
                    }

                    shingleBufferPosition++;
                    if (shingleBufferPosition == shingleBuf.Count)
                    {
                        // All shingles for this start token are out; refill on
                        // the next call.
                        nextToken = null;
                        shingleBufferPosition = 0;
                    }
                    return true;
                }
                else
                {
                    nextToken = null;
                    shingleBufferPosition = 0;
                }
            }
        }

        /// <summary>
        /// Loads the next token into the attributes: either a token read from
        /// the input stream or a filler token. If a token with position
        /// increment greater than 1 is read, filler tokens are handed out
        /// before it.
        /// </summary>
        /// <returns>
        /// true if a token (real or filler) is available in the attributes;
        /// false when the input stream is exhausted.
        /// </returns>
        private bool GetNextToken()
        {
            while (true)
            {
                if (numFillerTokensToInsert > 0)
                {
                    if (currentToken == null)
                    {
                        // First filler: set the real token aside so it can be
                        // restored after the fillers.
                        currentToken = CaptureState();
                    }
                    else
                    {
                        RestoreState(currentToken);
                    }
                    numFillerTokensToInsert--;
                    // A filler token occupies no space
                    offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
                    termAtt.SetTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.Length);
                    return true;
                }

                if (hasCurrentToken)
                {
                    if (currentToken != null)
                    {
                        // Fillers overwrote the attributes; bring the real
                        // token back.
                        RestoreState(currentToken);
                        currentToken = null;
                    }
                    hasCurrentToken = false;
                    return true;
                }

                if (!input.IncrementToken())
                {
                    return false;
                }

                hasCurrentToken = true;
                if (posIncrAtt.PositionIncrement > 1)
                {
                    numFillerTokensToInsert = posIncrAtt.PositionIncrement - 1;
                }
            }
        }

        /// <summary>
        /// Fills the token buffer from the input and rebuilds the shingle text
        /// and end offsets for the buffered tokens.
        /// </summary>
        /// <returns>
        /// true if the buffer holds at least one token to emit; false otherwise.
        /// </returns>
        private bool FillShingleBuffer()
        {
            bool addedToken = false;

            // Try to fill the shingle buffer.
            do
            {
                if (GetNextToken())
                {
                    shingleBuf.AddLast(CaptureState());
                    if (shingleBuf.Count > maxShingleSize)
                    {
                        shingleBuf.RemoveFirst();
                    }
                    addedToken = true;
                }
                else
                {
                    break;
                }
            } while (shingleBuf.Count < maxShingleSize);

            if (shingleBuf.Count == 0)
            {
                return false;
            }

            // If no new token could be added to the shingle buffer, we have
            // reached the end of the input stream and have to discard the least
            // recent token.
            if (!addedToken)
            {
                shingleBuf.RemoveFirst();
            }

            if (shingleBuf.Count == 0)
            {
                return false;
            }

            ClearShingles();

            // A freshly allocated int[] is already zero-filled.
            endOffsets = new int[shingleBuf.Count];

            int i = 0;
            foreach (State state in shingleBuf)
            {
                RestoreState(state);

                // The term does not change while it is appended to the
                // shingles, so read it once per token.
                char[] termBuffer = termAtt.TermBuffer();
                int termLength = termAtt.TermLength();

                // Token i takes part in every shingle of length i + 1 or more.
                for (int j = i; j < shingles.Length; j++)
                {
                    if (shingles[j].Length != 0)
                    {
                        shingles[j].Append(TOKEN_SEPARATOR);
                    }
                    // Append straight from the term buffer; no temporary array.
                    shingles[j].Append(termBuffer, 0, termLength);
                }

                endOffsets[i] = offsetAtt.EndOffset;
                i++;
            }

            return true;
        }

        /// <summary>
        /// Resets the wrapped stream and discards all buffered tokens and
        /// pending filler state.
        /// </summary>
        public override void Reset()
        {
            base.Reset();
            nextToken = null;
            shingleBufferPosition = 0;
            shingleBuf.Clear();
            numFillerTokensToInsert = 0;
            currentToken = null;
            hasCurrentToken = false;
        }
    }
}