要可靠地把自然语言切分(tokenize)成句子确实很困难,这还没有算上把 HTML 也掺进来所增加的复杂性。有一些应用程序等在尝试解决自然语言处理(Natural Language Processing)问题,其中一个例子是运行在 Java(不是 JavaScript)上的 Stanford Tokenizer。
正如人们不断提到的那样,正则表达式不是这个问题的解决方案,语言不是正则的,所以不要指望只有正则表达式的解决方案。
SO 上有一个相关问题:《CoffeeScript 或 JavaScript 中的基本 NLP——Punkt tokenization、简单训练的贝叶斯模型——从哪里开始?》。我认为它相当简明地总结了 JavaScript 方面的现状。
无论如何,为了至少给你一点可以动手试验的东西,我为你随手写了一小段代码。在标记(markup)或语言稍微变得复杂或有所不同之前,它工作得还算不错,但归根结底离真正达标还差得很远。不过,它也许足以满足你的需求,这我就不清楚了。
CSS
/* Text-style helpers. The script in this file never emits these classes;
   presumably they are used by the page's own markup (TODO confirm). */
.emphasis {
font-style: italic;
}
.bold {
font-weight: bold;
}
.emphasis.bold {
font-style: italic;
font-weight: bold;
}
/* Not emitted by the script in this file either (TODO confirm where it is used). */
.unidentified {
background-color: pink;
}
/* sentence0 / sentence1: pushSpan appends (color % 2) to "sentence", so
   consecutive detected sentences alternate between these two backgrounds. */
.sentence0 {
background-color: yellow;
}
.sentence1 {
background-color: green;
}
/* sentence2: class passed by the anythingElse fallback for text that no
   other handler consumed. */
.sentence2 {
background-color: red;
}
/* whitespace: class passed by leadingWhitespaces for whitespace found at the
   start of the remaining text; "pre" keeps that whitespace from collapsing. */
.whitespace {
white-space: pre;
background-color: blue;
}
Javascript
/*jslint maxerr: 50, indent: 4, browser: true */
(function () {
"use strict";
// Shared state for the sentence guesser.
//   rxOpen / rxClose  - first opening / closing tag in a fragment.
//   rxWhitespace      - whitespace at the very start of a fragment.
//   rxSupStart/End    - a <sup> element at the very start of a fragment.
//   sentenceEnd       - pieces that are concatenated into rxIndex.
//   color             - counter pushSpan uses to alternate sentence classes.
//   rxIndex           - the combined "end of sentence" pattern.
var rxOpen = new RegExp("<[^\\/].+?>"),
    rxClose = new RegExp("<\\/.+?>"),
    // Greedy on purpose. A lazy "\\s+?" with nothing after it always matches
    // exactly one character, so a run of whitespace would be consumed one
    // character per pass of the main loop and eat into its iteration cap.
    rxWhitespace = new RegExp("^\\s+"),
    rxSupStart = new RegExp("^<sup\\b[^>]*>"),
    rxSupEnd = new RegExp("<\\/sup>"),
    sentenceEnd = [],
    color = 0,
    rxIndex;

// One non-digit character followed by a run of ". ! ?" (the non-digit keeps
// decimal points such as "3.14" from counting as a sentence end).
sentenceEnd.push(new RegExp("[^\\d][\\.!\\?]+"));
// ...only when an even number of double quotes remains up to the end of the
// text, i.e. the punctuation is not inside a quoted passage.
sentenceEnd.push(new RegExp("(?=([^\\\"]*\\\"[^\\\"]*\\\")*[^\\\"]*?$)"));
// ...and not when a closing ) ] } or | follows without its opener appearing
// first, i.e. the punctuation is not inside such a pair.
sentenceEnd.push(new RegExp("(?![^\\(]*?\\))"));
sentenceEnd.push(new RegExp("(?![^\\[]*?\\])"));
sentenceEnd.push(new RegExp("(?![^\\{]*?\\})"));
sentenceEnd.push(new RegExp("(?![^\\|]*?\\|)"));
// Deliberately disabled exclusions, kept for reference:
//sentenceEnd.push(new RegExp("(?![^\\\\]*?\\\\)"));
//sentenceEnd.push(new RegExp("(?![^\\/.]*\\/)")); // all could be a problem, but this one is problematic

// Glue the pieces together, in order, into a single pattern.
rxIndex = new RegExp(sentenceEnd.map(function (piece) {
    return piece.source;
}).join(""));
/**
 * Finds where the first sentence of a fragment ends.
 *
 * @param {string} html Fragment to test against rxIndex.
 * @returns {number} Index of the last character of the first rxIndex match,
 *     or -1 when rxIndex does not match at all.
 */
function indexSentenceEnd(html) {
    // rxIndex is built without flags, so exec always scans from the start.
    var found = rxIndex.exec(html);
    if (found === null) {
        return -1;
    }
    // Start of the match plus its length, minus one: the last matched character.
    return found.index + found[0].length - 1;
}
/**
 * Wraps a piece of markup in a <span> and appends it to the output list.
 *
 * For the class name "sentence" the current parity of the shared `color`
 * counter is appended (giving "sentence0" / "sentence1"), the optional extra
 * class is added, and the counter is advanced. Any other class name is used
 * verbatim and the extra class is ignored.
 *
 * @param {Array} array Output list of HTML strings; mutated.
 * @param {string} className Base class for the span.
 * @param {string} string Markup placed inside the span, inserted as-is.
 * @param {string} [classNameOpt] Extra class, only honoured for "sentence".
 */
function pushSpan(array, className, string, classNameOpt) {
    var classes = className,
        openTag;
    if (className === "sentence") {
        classes = className + (color % 2);
        if (classNameOpt) {
            classes = classes + " " + classNameOpt;
        }
        color += 1;
    }
    openTag = '<span class="' + classes + '">';
    array.push(openTag + string + '</span>');
}
/**
 * Moves a <sup>...</sup> element found at the very start of the fragment
 * into the span that was emitted last, so that the superscript stays
 * attached to the text before it.
 *
 * The fragment is returned untouched when it does not start with <sup>,
 * when no closing </sup> can be found, or when nothing has been emitted yet
 * (there is no previous span to attach to).
 *
 * @param {string} html Remaining fragment.
 * @param {Array} array Output list of span strings; its last entry is
 *     rewritten when a superscript is absorbed.
 * @returns {string} The fragment with the absorbed <sup> element removed.
 */
function addSupToPrevious(html, array) {
    var supCloseLength = 6, // length of "</sup>"
        spanCloseLength = 7, // length of the "</span>" that ends every entry
        end,
        last;
    if (array.length === 0 || html.search(rxSupStart) === -1) {
        return html;
    }
    end = html.search(rxSupEnd);
    if (end === -1) {
        // Unterminated <sup>: leave it for the other handlers rather than
        // slicing from a negative offset.
        return html;
    }
    end += supCloseLength;
    last = array.pop();
    // Re-open the previous span just before its closing tag and splice the
    // superscript in.
    array.push(last.slice(0, -spanCloseLength) + html.slice(0, end) + last.slice(-spanCloseLength));
    return html.slice(end);
}
/**
 * Emits whatever rxWhitespace matches at the start of the fragment as a
 * "whitespace" span and strips it from the fragment.
 *
 * @param {string} html Remaining fragment.
 * @param {Array} array Output list; receives the whitespace span, if any.
 * @returns {string} The fragment without the matched whitespace, or the
 *     fragment itself when rxWhitespace does not match.
 */
function leadingWhitespaces(html, array) {
    var lead = rxWhitespace.exec(html),
        count;
    if (lead === null) {
        return html;
    }
    count = lead[0].length;
    pushSpan(array, "whitespace", html.slice(0, count));
    return html.slice(count);
}
/**
 * Handles the case where the whole remaining fragment is one sentence:
 * either no sentence end is found at all, or the first sentence end is the
 * final character of the fragment.
 *
 * @param {string} html Remaining fragment.
 * @param {Array} array Output list; receives one "sentence" span on success.
 * @returns {string} "" when the fragment was consumed, otherwise the
 *     fragment unchanged.
 */
function paragraphIsSentence(html, array) {
    var index = indexSentenceEnd(html);
    // indexSentenceEnd reports the index of the LAST character of the match,
    // so a sentence that ends flush with the fragment yields length - 1
    // (comparing against html.length could never succeed).
    if (index === -1 || index >= html.length - 1) {
        pushSpan(array, "sentence", html, "paragraphIsSentence");
        return "";
    }
    return html;
}
/**
 * Handles a fragment that contains no opening tag: cuts off the first
 * sentence (or everything, when no sentence end is found) and emits it.
 *
 * @param {string} html Remaining fragment.
 * @param {Array} array Output list; receives one "sentence" span on success.
 * @returns {string} What is left after the emitted sentence, or the fragment
 *     unchanged when it contains an opening tag.
 */
function paragraphNoMarkup(html, array) {
    var cut;
    if (html.search(rxOpen) !== -1) {
        return html;
    }
    cut = indexSentenceEnd(html);
    if (cut === -1) {
        cut = html.length;
    }
    // Step past the final character of the sentence so the slice includes it.
    cut += 1;
    pushSpan(array, "sentence", html.slice(0, cut), "paragraphNoMarkup");
    return html.slice(cut);
}
/**
 * Handles a fragment with markup whose first sentence end lies outside the
 * first element: before the first opening tag or after the first closing
 * tag. The sentence is cut off and emitted.
 *
 * @param {string} html Remaining fragment.
 * @param {Array} array Output list; receives one "sentence" span on success.
 * @returns {string} What is left after the emitted sentence, or the fragment
 *     unchanged when it has no opening tag or the sentence end falls between
 *     the first opening and first closing tag.
 */
function sentenceUncontained(html, array) {
    var open = html.search(rxOpen),
        end,
        close;
    if (open === -1) {
        return html;
    }
    end = indexSentenceEnd(html);
    if (end === -1) {
        end = html.length;
    }
    close = html.search(rxClose);
    if (end >= open && end <= close) {
        // Sentence end sits between the tags; not this handler's case.
        return html;
    }
    // Step past the final character of the sentence so the slice includes it.
    end += 1;
    pushSpan(array, "sentence", html.slice(0, end), "sentenceUncontained");
    return html.slice(end);
}
/**
 * Handles a fragment whose first sentence end lies strictly between the
 * first opening tag and the first closing tag: everything up to and
 * including that closing tag is emitted as one sentence.
 *
 * @param {string} html Remaining fragment.
 * @param {Array} array Output list; receives one "sentence" span on success.
 * @returns {string} What follows the closing tag, or the fragment unchanged
 *     when this handler does not apply.
 */
function sentenceContained(html, array) {
    var open = html.search(rxOpen),
        end,
        closing,
        stop;
    if (open === -1) {
        return html;
    }
    end = indexSentenceEnd(html);
    if (end === -1) {
        end = html.length;
    }
    // rxClose has no flags, so exec always scans from the start.
    closing = rxClose.exec(html);
    if (closing === null || end <= open || end >= closing.index) {
        return html;
    }
    // Cut right after the closing tag so the element stays intact.
    stop = closing.index + closing[0].length;
    pushSpan(array, "sentence", html.slice(0, stop), "sentenceContained");
    return html.slice(stop);
}
/**
 * Last-resort handler: emits the entire remaining fragment under the
 * "sentence2" class and reports that nothing is left.
 *
 * @param {string} html Remaining fragment.
 * @param {Array} array Output list; receives one span.
 * @returns {string} Always "".
 */
function anythingElse(html, array) {
    var fallbackClass = "sentence2";
    pushSpan(array, fallbackClass, html, "anythingElse");
    return "";
}
/**
 * Rewrites every <p> element in the document so that each guessed sentence
 * (and each piece of leading whitespace) is wrapped in its own <span>.
 *
 * Each pass over a paragraph tries the handlers in a fixed order and stops
 * at the first one that shortens the remaining markup; the next pass starts
 * again from the first handler. A paragraph gets at most 100 passes. Markup
 * still unprocessed when that cap is reached is emitted through
 * anythingElse, so no content is lost when the result is written back.
 *
 * The misspelt name is kept because the call site uses it.
 */
function guessSenetences() {
    // Order matters: from the most specific case down to the catch-all.
    var handlers = [
            addSupToPrevious,
            leadingWhitespaces,
            paragraphIsSentence,
            paragraphNoMarkup,
            sentenceUncontained,
            sentenceContained,
            anythingElse
        ],
        paragraphs = document.getElementsByTagName("p");
    Array.prototype.forEach.call(paragraphs, function (paragraph) {
        var html = paragraph.innerHTML,
            array = [],
            safety = 100,
            length,
            i;
        while (html.length && safety) {
            length = html.length;
            for (i = 0; i < handlers.length; i += 1) {
                html = handlers[i](html, array);
                if (html.length !== length) {
                    // Progress was made; restart from the first handler.
                    break;
                }
            }
            safety -= 1;
        }
        if (html.length) {
            // Pass cap reached with markup left over: keep it instead of
            // silently dropping it.
            anythingElse(html, array);
        }
        paragraph.innerHTML = array.join("");
    });
}
guessSenetences();
}());
可在 jsfiddle 上查看演示。