这在一定程度上取决于您要将结果用于什么:是用于修改字符串、打印统计信息,还是复制单词等;它是否必须是线程安全的,是否应该支持常量字符串等;以及效率和简单性哪个更重要。
一种方法是完全依赖指针。
1. 指向第一个单词的指针 ptr1。
2. 指向下一个单词的指针 ptr2。
3. 将 ptr1 处的单词与 ptr2 处的单词进行比较。
4. 将 ptr2 前进到下一个单词。
5. 如果未找到匹配项,则转到 1。
6. 如果未找到匹配项,则将 ptr2 前进到下一个单词。
7. 转到 2。
用于比较单词的简单框架可能是这样的:
/* Compare the word starting at word1 with the word starting at word2.
 *
 * A "word" is a run of bytes for which isalpha() is true. Both arguments
 * must point into NUL-terminated strings.
 *
 * Returns 1 when both words are non-empty, equal byte for byte, and both
 * end at the same length; otherwise 0. */
int compare_word(char *word1, char *word2)
{
    int i;

    /* <ctype.h> functions need a value representable as unsigned char;
     * a plain char may be negative for bytes >= 0x80. */
    for (i = 0;
         word1[i] && word2[i] &&
         word1[i] == word2[i] &&
         isalpha((unsigned char)word1[i]); ++i)
        ;

    /* i != 0 rejects empty words; both words must end at index i. */
    return i && !isalpha((unsigned char)word1[i])
             && !isalpha((unsigned char)word2[i]);
}
计算单词之间的距离时要记住的一件事是注意多字节字符串格式,例如 utf-8。一个字母可以是几个字节:
a æøå a
^---^ 8 bytes, 5 characters.
您可以使用 mbstowcs 来获取多字节序列的长度,但是您还必须注意语言环境(locale)。一个典型的场景:
/* A string consisting of three non-ASCII letters. */
char *test = "æøå";
/* First pair of measurements, taken before setlocale() is called.
 * In the results shown for this snippet, mbstowcs() prints 4294967295
 * here while strlen() prints 6. */
/* NOTE(review): mbstowcs() and strlen() return size_t, which does not
 * match the "%u" conversion -- confirm on the target platform. */
printf("%s: %u\n", test, mbstowcs(NULL, test, 0));
printf("%s: %u\n", test, strlen(test));
/* Select the locale named by the environment for all categories. */
setlocale(LC_ALL, "");
puts("-----------------------------------------------");
/* Second pair of measurements, after the locale change.
 * In the results shown, mbstowcs() now prints 3; strlen() still prints 6. */
printf("%s: %u\n", test, mbstowcs(NULL, test, 0));
printf("%s: %u\n", test, strlen(test));
结果:
æøå: 4294967295
æøå: 6
-----------------------------------------------
æøå: 3
æøå: 6
无论如何。作为概念示例,这里有一些代码。请注意,这不是多字节安全的——(只有 ASCII 才能给出合理的结果)。点画线末尾的数字依次是“单词 1 的开头到单词 2 的开头的距离”、“单词 1 的结尾到单词 2 的开头的距离”和“单词宽度”。示例输出:
$ ./wordword
Enter one sentence:
Lizzie Borden took an axe And gave her mother forty whacks When she saw what she had done She gave her father forty-one.
Lizzie Borden took an axe And gave her mother forty whacks When she saw what she had done She gave her father forty-one.
^---------------------------------------------------------------^ (64, 60, 4)
MATCH: 'gave' 60 bytes of separation. (Press enter for next.)
Lizzie Borden took an axe And ____ her mother forty whacks When she saw what she had done She ____ her father forty-one.
^---------------------------------------------------------------^ (64, 61, 3)
MATCH: 'her' 61 bytes of separation. (Press enter for next.)
Lizzie Borden took an axe And ____ ___ mother forty whacks When she saw what she had done She ____ ___ father forty-one.
^---------------------------------------------------------------^ (64, 59, 5)
MATCH: 'forty' 59 bytes of separation. (Press enter for next.)
Lizzie Borden took an axe And ____ ___ mother _____ whacks When she saw what she had done She ____ ___ father _____-one.
^------------^ (13, 10, 3)
MATCH: 'she' 10 bytes of separation. (Press enter for next.)
示例代码。(好吧,这里的完整性有点过火了。从 30 行开始,然后增长了一点。但是,它仍然有很多缺点,仅作为使用指针等的示例):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/* Terminal escape sequences for bold colored text. */
#define FMT_BBLACK "\033[1;30m" /* Bold color black */
#define FMT_BRED "\033[1;31m" /* Bold color red */
#define FMT_BGREEN "\033[1;32m" /* Bold color green */
#define FMT_BYELLOW "\033[1;33m" /* Bold color yellow */
#define FMT_BBLUE "\033[1;34m" /* Bold color blue */
#define FMT_BMAGENTA "\033[1;35m" /* Bold color magenta */
#define FMT_BCYAN "\033[1;36m" /* Bold color cyan */
#define FMT_BWHITE "\033[1;37m" /* Bold color white */
#define FMT_NONE "\033[0m" /* Reset */
/* Sequence print_word() emits before the word; FMT_NONE follows it. */
#define FMT_MATCH FMT_BRED
/* Bit flags tested by main() to decide which word of a matched pair
 * is overwritten via clear_word(). */
#define DEL_NONE 0x00 /* Keep words. (Causes re-match) */
#define DEL_WORD1 0x01 /* Remove first word-match */
#define DEL_WORD2 0x02 /* Remove second word-match */
#define DEL_BOTH (DEL_WORD1 | DEL_WORD2)
/* Print graph: when non-zero, main() calls explain() for every match. */
int debug = 1;
/* ********************************************************************** *
* Helper functions
* ********************************************************************** */
/* Skip over non-alphabetic bytes.
 *
 * p: NUL-terminated string to scan.
 *
 * Returns a pointer to the first alphabetic byte at or after p. If there
 * is none, the result points at the terminating NUL byte (it is never a
 * null pointer), so callers test the result with `*result`. */
char *skip_noword(char *p)
{
    /* <ctype.h> functions need a value representable as unsigned char;
     * a plain char may be negative for bytes >= 0x80. */
    while (*p && !isalpha((unsigned char)*p))
        ++p;
    return p;
}
/* Find the end of the word starting at p.
 *
 * p: NUL-terminated string; the word is the run of alphabetic bytes
 *    beginning at p (possibly empty).
 *
 * Returns a pointer to the first byte after that run. At end of string
 * the result points at the terminating NUL byte (it is never a null
 * pointer). */
char *eof_word(char *p)
{
    /* <ctype.h> functions need a value representable as unsigned char;
     * a plain char may be negative for bytes >= 0x80. */
    while (*p && isalpha((unsigned char)*p))
        ++p;
    return p;
}
/* Advance from the current word to the following one.
 *
 * First steps past the alphabetic run at p (eof_word), then past the
 * separator bytes after it (skip_noword).
 *
 * Returns a pointer to the first letter of the next word, or to the
 * terminating NUL byte when no further word exists. */
char *next_word(char *p)
{
    return skip_noword(eof_word(p));
}
/* Compare the whole word starting at p1 with the word starting at p2.
 *
 * A "word" is a run of bytes for which isalpha() is true. Both arguments
 * must point into NUL-terminated strings.
 *
 * Returns 1 when both words are non-empty, equal byte for byte, and both
 * end at the same length; otherwise 0. */
int compare_word(char *p1, char *p2)
{
    int i;

    /* <ctype.h> functions need a value representable as unsigned char;
     * a plain char may be negative for bytes >= 0x80. */
    for (i = 0;
         p1[i] && p2[i] &&
         isalpha((unsigned char)p1[i]) &&
         p1[i] == p2[i]; ++i)
        ;

    /* i != 0 rejects empty words; both words must end at index i. */
    return i && !isalpha((unsigned char)p1[i])
             && !isalpha((unsigned char)p2[i]);
}
/* ********************************************************************** *
* Search routine
* ********************************************************************** */
/* Locate the first word in buf that occurs again later in the string.
 *
 * buf:   NUL-terminated string to search; leading non-word bytes are
 *        skipped.
 * match: out parameter. Set to the later occurrence on success and to
 *        NULL when no repeated word is found.
 *
 * Returns a pointer to the earlier occurrence, or NULL when no word in
 * buf has a later duplicate. */
char *word_word(char *buf, char **match)
{
    char *head; /* Candidate first word. */
    char *cand; /* Word being compared against head. */

    *match = NULL;

    /* Outer loop: step through the words one at a time. */
    for (head = skip_noword(buf); *head != '\0'; head = next_word(head)) {
        /* Inner loop: compare head with every word that follows it. */
        for (cand = next_word(head); *cand != '\0'; cand = next_word(cand)) {
            if (compare_word(head, cand)) {
                *match = cand;
                return head;
            }
        }
    }

    return NULL;
}
/* ********************************************************************** *
* Clear, Copy, Print etc.
* ********************************************************************** */
/* Number of bytes from the end of the word at w1 to the start of w2.
 *
 * The end of the first word is found with eof_word(); the pointer
 * difference is returned converted to size_t.
 * */
size_t words_dist(char *w1, char *w2)
{
    char *w1_end = eof_word(w1);

    return w2 - w1_end;
}
/* Overwrite the word starting at p with a fill character.
 *
 * p: writable NUL-terminated string; every byte of the alphabetic run
 *    beginning at p is replaced in place.
 * r: replacement byte.
 *
 * Bytes after the word are left untouched. Nothing is written when p
 * does not start with an alphabetic byte.
 * */
void clear_word(char *p, char r)
{
    /* <ctype.h> functions need a value representable as unsigned char;
     * a plain char may be negative for bytes >= 0x80. */
    while (*p && isalpha((unsigned char)*p))
        *p++ = r;
}
/* Duplicate the word starting at p into freshly allocated memory.
 *
 * The copy holds the alphabetic run beginning at p followed by a NUL
 * terminator. The buffer comes from malloc(); the caller releases it
 * with free().
 *
 * Returns the new buffer, or NULL when allocation fails.
 * */
void *word_cpy(char *p)
{
    size_t len = eof_word(p) - p; /* Word length without terminator. */
    char *copy = malloc(len + 1);

    if (copy == NULL)
        return NULL;

    memcpy(copy, p, len);
    copy[len] = '\0';
    return copy;
}
/* Print a graph showing the positions of p2 and p3 inside p1.
 *
 * p1: start of the NUL-terminated string; printed as the first line.
 * p2: start of the first word of the pair, at or after p1.
 * p3: start of the second word of the pair, after p2.
 *
 * The second line places '^' under p2 and under p3, joined by '-', and
 * ends with three numbers: the distance from start of word 1 to start of
 * word 2, the distance from end of word 1 to start of word 2, and their
 * difference (the width of word 1). Output goes to stdout.
 * */
void explain(char *p1, char *p2, char *p3)
{
    size_t n1 = p3 - p2;
    size_t n2 = words_dist(p2, p3);

    puts(p1);
    /* Indent up to the column of p2. */
    while (p1++ != p2)
        putchar(' ');
    putchar('^');
    /* Draw the connector up to the column just before p3. */
    while (++p2 != p3)
        putchar('-');
    /* size_t does not match "%d"; convert explicitly and print as
     * unsigned long. */
    printf("^ (%lu, %lu, %lu)\n",
           (unsigned long)n1,
           (unsigned long)n2,
           (unsigned long)(n1 - n2));
}
/* Write a C-string to a stream in the match color.
 *
 * Emits FMT_MATCH, then the text, then FMT_NONE to reset formatting.
 * */
void print_word(FILE *out, char *word)
{
    fputs(FMT_MATCH, out);
    fputs(word, out);
    fputs(FMT_NONE, out);
}
/* Print only the word at p, where p points into a longer C-string.
 *
 * Works on a temporary heap copy made by word_cpy(), so the source
 * string is not modified. Prints nothing when the copy cannot be
 * allocated.
 * */
void print_word_safe(FILE *out, char *p)
{
    char *copy = word_cpy(p);

    if (copy != NULL) {
        print_word(out, copy);
        free(copy);
    }
}
/* Print only the word at p, where p points into a longer C-string.
 *
 * Avoids allocation: the byte following the word is temporarily
 * replaced by a NUL terminator and restored after printing, so the
 * string must be writable.
 * */
void print_word_mod(FILE *out, char *p)
{
    char *end = eof_word(p); /* First byte after the word. */
    char saved = *end;

    *end = '\0';
    print_word(out, p);
    *end = saved;
}
/* ********************************************************************** *
* Main
* ********************************************************************** */
/* Entry point.
 *
 * Takes the sentence from argv[1] when given, otherwise reads one line
 * from stdin. Repeatedly finds the next word that occurs again later in
 * the sentence, reports the pair on stderr (plus a graph on stdout when
 * debug is set), optionally blanks the matched words with '_', and waits
 * for a key press before continuing.
 *
 * Returns 0. */
int main(int argc, char *argv[])
{
    char buf_scan[4096]; /* Buffer holding typed input. */
    char *buf_start;     /* Start of buffer. */
    char *buf_pos;       /* Current position in buffer. */
    char *match;         /* Position for matched word. */
    int delete;          /* Delete flag mask. */

    debug = 1;           /* 1=Print explanation. */
    delete = DEL_BOTH;   /* DEL_[NONE, WORD1, WORD2, BOTH] */

    if (argc > 1) {
        /* Use first argument instead of user input. */
        buf_start = argv[1];
    } else {
        /* Get user input. */
        buf_start = buf_scan;
        fputs("Enter one sentence:\n", stderr);
        if (!fgets(buf_scan, sizeof(buf_scan), stdin))
            buf_scan[0] = 0x00;
        /* Strip the trailing newline, if any. strcspn() yields the index
         * of the terminator when there is no newline, so this is safe for
         * empty input and never removes a real character. */
        buf_scan[strcspn(buf_scan, "\n")] = 0x00;
        putc('\n', stderr);
    }

    buf_pos = buf_start;

    /* Get next matching pair. */
    while ((buf_pos = word_word(buf_pos, &match))) {
        if (debug)
            explain(buf_start, buf_pos, match);

        /* Report findings */
        fputs("MATCH: ", stderr);
        print_word_mod(stderr, buf_pos);
        /* words_dist() returns size_t, which does not match "%d";
         * convert explicitly and print as unsigned long. */
        fprintf(stderr,
                " %lu bytes of separation.",
                (unsigned long)words_dist(buf_pos, match)
        );

        /* Clear out matched word pair. */
        if (delete & DEL_WORD1) {
            clear_word(buf_pos, '_');
        }
        if (delete & DEL_WORD2) {
            clear_word(match, '_');
        }

        /* Advance head to next word. */
        buf_pos = next_word(buf_pos);

        fputs(" (Press enter for next.)", stderr);
        getchar();
        putc('\n', stderr);
    }

    /* Disabled final dump of the (possibly modified) sentence. */
    if (0 && debug)
        printf("FINE:\n%s\n\n", buf_start);

    return 0;
}