0

我有以下管道:

        // One ML.NET context is shared by data loading and every transform below.
        var mlContext = new MLContext();

        // Three in-memory sample rows; each row only carries a Message string.
        var samples = new[]
        {
            new Input {Message = "one two three one two three"},
            new Input {Message = "one two"},
            new Input {Message = "two three"}
        };
        var data = mlContext.Data.LoadFromEnumerable(samples);

        // Stage 1: featurize "Message" into the "TextFeatures" column.
        var featurize = mlContext.Transforms.Text.FeaturizeText("TextFeatures", "Message");

        // Stage 2: split "Message" into character tokens stored in "MessageTokens".
        var tokenize = mlContext.Transforms.Text.TokenizeCharacters("MessageTokens", "Message");

        // Stage 3: build n-grams over "MessageTokens" into "MessageNgrams".
        // NOTE(review): the positional argument 2 is presumably the n-gram length — confirm
        // against the ML.NET version in use.
        var ngrams = mlContext.Transforms.Text.ProduceNgrams("MessageNgrams", "MessageTokens", 2);

        var pipeline = featurize.Append(tokenize).Append(ngrams);

        // Fit and transform on the same data set.
        var transformedData = pipeline.Fit(data).Transform(data);

管道执行后,“MessageNgrams”列中包含的是浮点向量。如何获得实际的 n-gram 文本,例如 "one two"、"two three"?

4

2 回答 2

1
/// <summary>
/// Returns the textual word n-grams found in a single string by running it through an
/// ML.NET tokenize / map-to-key / n-gram pipeline and reading the slot names of the
/// resulting "NgramFeatures" column.
/// </summary>
/// <param name="str">Text to extract n-grams from. Null or blank text yields an empty list.</param>
/// <param name="nsize">N-gram length passed to <c>ProduceNgrams</c>; only this exact length is produced.</param>
/// <returns>One string per n-gram slot, with the '|' separator in each slot name replaced by a space.</returns>
List<string> getNgram(string str, int nsize)
{
    // No words means no n-grams; skip building and fitting a pipeline for nothing.
    if (string.IsNullOrWhiteSpace(str))
    {
        return new List<string>();
    }

    var mlContext = new MLContext();
    var dataView = mlContext.Data.LoadFromEnumerable(new[] { new TextData { Text = str } });

    // ProduceNgrams is fed the key-typed "Tokens" column, so the word tokens are
    // mapped to keys in place before the n-gram stage.
    var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text")
        .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
        .Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures", "Tokens",
            ngramLength: nsize,
            useAllLengths: false,
            weighting: NgramExtractingEstimator.WeightingCriteria.Tf));

    var transformedDataView = textPipeline.Fit(dataView).Transform(dataView);

    // The n-gram text lives in the column's slot-name annotation, not in the float values.
    VBuffer<ReadOnlyMemory<char>> slotNames = default;
    transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames);

    // DenseValues() yields one entry per slot even if the buffer is stored sparsely,
    // so the list position always matches the slot index in the feature vector.
    return slotNames.DenseValues()
        .Select(name => name.ToString().Replace('|', ' '))
        .ToList();
}
于 2020-07-15T10:38:26.263 回答
0

下面是我在 SDCA 训练管道中使用的代码片段,您的情况应该非常相似,或者至少可以为您提供实现思路。

// Read the slot-name annotation of the "MessageNgrams" column into a buffer.
VBuffer<ReadOnlyMemory<char>> slotNames = default;
transformedData.Schema["MessageNgrams"].GetSlotNames(ref slotNames);

// Build a slot-index -> label lookup covering every logical slot of the buffer.
var slotLabels = new Dictionary<int, string>();
var index = 0;
while (index < slotNames.Length)
{
    slotLabels[index] = slotNames.GetItemOrDefault(index).ToString();
    index++;
}

slotLabels 的键(索引)与浮点向量中各元素的位置一一对应,因此可以用向量元素的下标查到对应的 n-gram 文本。

于 2019-03-13T15:08:56.877 回答