-1

为了让您了解我需要什么:我一直在使用下面的代码来解析 <p> 标记中的内容,并将每个句子包装在 <span> 标记中,这样我就可以与页面上的句子进行交互。

// Wrap every sentence of every <p> in <span class="sentence">.
// The paragraph is read through .text(), so only its text content is fed to
// the regex; writing the result back with .html() therefore replaces any
// nested elements (<a>, <img>, ...) with their plain text.
$('p').each(function () {
    var $paragraph = $(this),
        // $1 = one sentence up to and including its terminator (and an
        // optional closing quote); $3 = the whitespace (or end) that follows.
        sentencePattern = /(((?![.!?]['"]?\s).)*[.!?]['"]?)(\s|$)/g,
        wrapped = $paragraph
            .text()
            .replace(sentencePattern, '<span class="sentence">$1</span>$3');

    $paragraph.html(wrapped);
});

但是,以下行说明了我的问题:

<p>This is a <a href="#">link</a> and it is removed with the above code! Here is another sentence.</p>

我正在搜索的 <p> 标记中的嵌套标记(例如 <a>、<img> 等)将与我正在使用的代码一起删除。我需要保持这些标签完好无损,因此 <p> 标签中的内容保持不变。

我需要:

<p><span class="sentence">This is a <a href="#">link</a> and it is removed with the above code!</span> <span class="sentence">Here is another sentence.</span></p>

在阅读了这场关于使用正则表达式解析 HTML 的激烈讨论之后,我得出的结论是:我需要结合使用某种 HTML 解析器来遍历 <p> 标记中的子标记,然后再使用正则表达式来查找句子。如果有帮助的话,我认为上面列出的正则表达式应该能满足我的大部分用途。

那么:我该怎么做呢?

4

2 回答 2

0

将自然语言可靠地切分为句子确实很困难,这还没有算上把 HTML 也掺进来所增加的复杂性。有一些应用程序等尝试处理自然语言处理(Natural Language Processing),其中一个例子是运行在 Java(不是 JavaScript)上的 Stanford Tokenizer。

正如人们不断提到的那样,正则表达式不是这个问题的解决方案,语言不是正则的,所以不要指望只有正则表达式的解决方案。

SO 上有一个问题:“CoffeeScript 或 JavaScript 中的基本 NLP——Punkt 分词、简单训练的贝叶斯模型——从哪里开始?”我认为它相当简明地总结了 JavaScript 方面的情况。

无论如何,为了至少给你一些可以上手试验的东西,我为你快速写了一小段代码。在标记或语言开始变得稍微复杂或有所不同之前,它工作得还算不错,但归根结底离目标还差得很远。不过,它也许足以满足您的需求,这我就不清楚了。

CSS

/* Generic text helpers. None of these three selectors is referenced by the
   script shown alongside this stylesheet. */
.emphasis {
    font-style: italic;
}
.bold {
    font-weight: bold;
}
/* Element carrying both helper classes at once. */
.emphasis.bold {
    font-style: italic;
    font-weight: bold;
}
/* Not emitted by the accompanying script. */
.unidentified {
    background-color: pink;
}
/* The script alternates between sentence0 and sentence1 for consecutive
   sentences, so neighbouring sentences get different backgrounds. */
.sentence0 {
    background-color: yellow;
}
.sentence1 {
    background-color: green;
}
/* Used by the script's catch-all handler for markup it could not classify. */
.sentence2 {
    background-color: red;
}
/* Whitespace found in front of a sentence; white-space: pre keeps the run
   visible instead of collapsing it. */
.whitespace {
    white-space: pre;
    background-color: blue;
}

Javascript

/*jslint maxerr: 50, indent: 4, browser: true */

(function () {
    "use strict";

    // Shared patterns and state for the sentence-guessing helpers.
    //   rxOpen       - first opening tag (anything starting with "<" that is
    //                  not a closing tag); only its position is used.
    //   rxClose      - first closing tag.
    //   rxWhitespace - the whole run of leading whitespace. The quantifier is
    //                  greedy on purpose: the lazy form "^\s+?" matches a
    //                  single character, which produced one span per
    //                  whitespace character and used up the caller's
    //                  per-paragraph iteration budget.
    //   rxSupStart   - a <sup ...> opening tag at the very start of the input.
    //   rxSupEnd     - the first </sup>.
    //   sentenceEnd  - pattern fragments concatenated into rxIndex.
    //   color        - running sentence counter used to alternate CSS classes.
    var rxOpen = new RegExp("<[^\\/].+?>"),
        rxClose = new RegExp("<\\/.+?>"),
        rxWhitespace = new RegExp("^\\s+"),
        rxSupStart = new RegExp("^<sup\\b[^>]*>"),
        rxSupEnd = new RegExp("<\\/sup>"),
        sentenceEnd = [],
        color = 0,
        rxIndex;

    // One or more terminators (. ! ?) preceded by a non-digit character, so
    // decimal points such as "3.14" are not treated as a sentence end.
    sentenceEnd.push(new RegExp("[^\\d][\\.!\\?]+"));
    // Only when an even number of double quotes remains after the match,
    // i.e. the terminator is not inside an open "..." quotation.
    sentenceEnd.push(new RegExp("(?=([^\\\"]*\\\"[^\\\"]*\\\")*[^\\\"]*?$)"));
    // Not when a closing ), ], } or | follows without its opener appearing
    // first, i.e. the terminator sits inside a bracketed or piped group.
    sentenceEnd.push(new RegExp("(?![^\\(]*?\\))"));
    sentenceEnd.push(new RegExp("(?![^\\[]*?\\])"));
    sentenceEnd.push(new RegExp("(?![^\\{]*?\\})"));
    sentenceEnd.push(new RegExp("(?![^\\|]*?\\|)"));
    //sentenceEnd.push(new RegExp("(?![^\\\\]*?\\\\)"));
    //sentenceEnd.push(new RegExp("(?![^\\/.]*\\/)")); // all could be a problem, but this one is problematic

    // Glue the fragments together, in order, into the single pattern used to
    // locate the end of the next sentence.
    rxIndex = new RegExp(sentenceEnd.reduce(function (previousValue, currentValue) {
        return previousValue + currentValue.source;
    }, ""));

    /**
     * Finds where the first sentence in `html` ends.
     *
     * @param {string} html Text or markup to scan.
     * @returns {number} Zero-based position of the last character matched by
     *     rxIndex (the final terminator), or -1 when nothing matches.
     */
    function indexSentenceEnd(html) {
        var found = rxIndex.exec(html);

        if (found === null) {
            return -1;
        }

        // Start of the match plus its length, minus one, is the position of
        // the last matched character.
        return found.index + found[0].length - 1;
    }

    /**
     * Appends `string` wrapped in a <span> to `array`.
     *
     * When `className` is exactly "sentence" the class becomes "sentence0" or
     * "sentence1" (alternating with the shared `color` counter, which is then
     * incremented) and `classNameOpt`, if truthy, is added as a second class.
     * For any other `className` the class is used verbatim and
     * `classNameOpt` is ignored.
     *
     * @param {string[]} array Output list that receives the span markup.
     * @param {string} className Base CSS class.
     * @param {string} string Inner HTML of the span (inserted as-is).
     * @param {string} [classNameOpt] Extra class for "sentence" spans.
     */
    function pushSpan(array, className, string, classNameOpt) {
        var classes = className;

        if (className === "sentence") {
            classes = className + (color % 2);
            if (classNameOpt) {
                classes = classes + " " + classNameOpt;
            }

            color += 1;
        }

        array.push('<span class="' + classes + '">' + string + '</span>');
    }

    /**
     * If `html` starts with a <sup>...</sup> element, moves that element
     * inside the span most recently pushed to `array`, just before the span's
     * closing tag.
     *
     * The input is returned untouched when it does not start with <sup>, when
     * the <sup> has no matching </sup>, or when `array` holds no span to
     * attach to. (Previously an unclosed <sup> truncated the input to its
     * last character and an empty `array` raised a TypeError.)
     *
     * @param {string} html Remaining markup of the paragraph.
     * @param {string[]} array Spans produced so far; the last entry is
     *     replaced when a <sup> is attached.
     * @returns {string} The markup that is left after the consumed <sup>.
     */
    function addSupToPrevious(html, array) {
        var spanClose = "</span>",
            supClose = "</sup>",
            end,
            last;

        if (array.length === 0 || html.search(rxSupStart) === -1) {
            return html;
        }

        end = html.search(rxSupEnd);
        if (end === -1) {
            return html;
        }

        // Consume up to and including the closing </sup>.
        end += supClose.length;

        // Every entry in `array` ends with "</span>"; splice the <sup> element
        // in front of that closing tag.
        last = array.pop();
        array.push(last.slice(0, -spanClose.length) + html.slice(0, end) + last.slice(-spanClose.length));

        return html.slice(end);
    }

    /**
     * Strips whatever rxWhitespace matches at the start of `html` and records
     * it as a "whitespace" span.
     *
     * @param {string} html Remaining markup of the paragraph.
     * @param {string[]} array Output list of spans.
     * @returns {string} `html` without the matched prefix; the same text when
     *     nothing matched.
     */
    function leadingWhitespaces(html, array) {
        var found = rxWhitespace.exec(html),
            width;

        if (found === null) {
            return html.slice(0);
        }

        width = found[0].length;
        pushSpan(array, "whitespace", html.slice(0, width));

        return html.slice(width);
    }

    /**
     * Treats the whole of `html` as one sentence when it contains no sentence
     * end at all, or when its first sentence end is its very last character.
     *
     * indexSentenceEnd() reports the position of the last matched character,
     * so "ends at the end of the input" is `html.length - 1`; comparing with
     * `html.length` could never succeed.
     *
     * @param {string} html Remaining markup of the paragraph.
     * @param {string[]} array Output list of spans.
     * @returns {string} "" when the input was consumed, otherwise `html`
     *     unchanged.
     */
    function paragraphIsSentence(html, array) {
        var index = indexSentenceEnd(html);

        if (index === -1 || index === html.length - 1) {
            pushSpan(array, "sentence", html, "paragraphIsSentence");
            return "";
        }

        return html;
    }

    /**
     * Handles input in which rxOpen finds no opening tag: cuts off the first
     * sentence (or everything, when no sentence end is found) and records it
     * as a sentence span.
     *
     * @param {string} html Remaining markup of the paragraph.
     * @param {string[]} array Output list of spans.
     * @returns {string} What is left after the first sentence; `html`
     *     unchanged when an opening tag is present.
     */
    function paragraphNoMarkup(html, array) {
        var cut;

        if (html.search(rxOpen) !== -1) {
            return html.slice(0);
        }

        cut = indexSentenceEnd(html);
        if (cut === -1) {
            cut = html.length;
        }

        // Move from "position of the terminator" to "one past it" so the
        // terminator stays with its sentence.
        cut += 1;
        pushSpan(array, "sentence", html.slice(0, cut), "paragraphNoMarkup");

        return html.slice(cut);
    }

    /**
     * Handles input that contains an opening tag where the first sentence end
     * lies before that opening tag or after the first closing tag. The text
     * up to and including the sentence end becomes one sentence span.
     *
     * @param {string} html Remaining markup of the paragraph.
     * @param {string[]} array Output list of spans.
     * @returns {string} What is left after the sentence; `html` unchanged
     *     when there is no opening tag or the sentence end falls between the
     *     first opening and first closing tag (inclusive).
     */
    function sentenceUncontained(html, array) {
        var firstOpen = html.search(rxOpen),
            endAt,
            firstClose,
            cut;

        if (firstOpen === -1) {
            return html.slice(0);
        }

        endAt = indexSentenceEnd(html);
        if (endAt === -1) {
            endAt = html.length;
        }

        firstClose = html.search(rxClose);
        if (endAt >= firstOpen && endAt <= firstClose) {
            // Sentence end sits inside the first element; not handled here.
            return html.slice(0);
        }

        cut = endAt + 1;
        pushSpan(array, "sentence", html.slice(0, cut), "sentenceUncontained");

        return html.slice(cut);
    }

    /**
     * Handles input whose first sentence end lies strictly between the first
     * opening tag and the first closing tag. Everything up to and including
     * that closing tag becomes one sentence span, so the element is not split.
     *
     * @param {string} html Remaining markup of the paragraph.
     * @param {string[]} array Output list of spans.
     * @returns {string} What is left after the closing tag; `html` unchanged
     *     when the condition above does not hold.
     */
    function sentenceContained(html, array) {
        var firstOpen = html.search(rxOpen),
            endAt,
            closing,
            cut;

        if (firstOpen === -1) {
            return html.slice(0);
        }

        endAt = indexSentenceEnd(html);
        if (endAt === -1) {
            endAt = html.length;
        }

        closing = rxClose.exec(html);
        if (closing === null || endAt <= firstOpen || endAt >= closing.index) {
            return html.slice(0);
        }

        // Cut right after the closing tag rather than after the terminator.
        cut = closing.index + closing[0].length;
        pushSpan(array, "sentence", html.slice(0, cut), "sentenceContained");

        return html.slice(cut);
    }

    /**
     * Catch-all: records everything that is left as a single "sentence2"
     * span and consumes it.
     *
     * @param {string} html Remaining markup of the paragraph.
     * @param {string[]} array Output list of spans.
     * @returns {string} Always "" because the whole input is consumed.
     */
    function anythingElse(html, array) {
        var remainder = html,
            nothingLeft = "";

        pushSpan(array, "sentence2", remainder, "anythingElse");

        return nothingLeft;
    }

    /**
     * Rewrites the innerHTML of every <p> in the document so that each
     * guessed sentence (and each run of whitespace between them) is wrapped
     * in its own <span>.
     *
     * For each paragraph the remaining markup is offered to the handlers in
     * a fixed order; the first handler that shortens it wins and the cycle
     * starts again. At most 100 cycles run per paragraph. If markup is still
     * left when that budget is exhausted it is appended through
     * anythingElse() instead of being dropped, so no paragraph content is
     * lost.
     */
    function guessSenetences() {
        var paragraphs = document.getElementsByTagName("p"),
            // Order matters: earlier handlers take precedence.
            handlers = [
                addSupToPrevious,
                leadingWhitespaces,
                paragraphIsSentence,
                paragraphNoMarkup,
                sentenceUncontained,
                sentenceContained,
                anythingElse
            ],
            maxCycles = 100;

        Array.prototype.forEach.call(paragraphs, function (paragraph) {
            var html = paragraph.innerHTML,
                array = [],
                safety = maxCycles,
                before,
                i;

            while (html.length && safety) {
                before = html.length;

                // A handler signals that it consumed something by returning
                // a string of a different length.
                for (i = 0; i < handlers.length; i += 1) {
                    html = handlers[i](html, array);
                    if (html.length !== before) {
                        break;
                    }
                }

                safety -= 1;
            }

            // Budget exhausted with markup left over: keep it rather than
            // silently truncating the paragraph.
            if (html.length) {
                anythingElse(html, array);
            }

            paragraph.innerHTML = array.join("");
        });
    }

    guessSenetences();
}());

jsfiddle 上

于 2013-05-16T18:44:14.320 回答
-1

如果要保持标签完整,则需要使用 .html() 而不是 .text() 。检查下面的代码,如果它不起作用,请告诉我。 演示

// Wrap every sentence of every <p> in <span class="sentence">.
// The paragraph is read through .html(), so the regex runs over the raw
// markup (tags and attributes included) and nested elements are kept in the
// rewritten paragraph.
$('p').each(function () {
    var $paragraph = $(this),
        // $1 = one sentence up to and including its terminator (and an
        // optional closing quote); $3 = the whitespace (or end) that follows.
        sentencePattern = /(((?![.!?]['"]?\s).)*[.!?]['"]?)(\s|$)/g,
        wrapped = $paragraph
            .html()
            .replace(sentencePattern, '<span class="sentence">$1</span>$3');

    $paragraph.html(wrapped);
});
于 2013-05-15T05:33:02.780 回答