2
4

5 回答 5

20

Using erase/unique and a C++11 lambda.

#include <algorithm>
#include <iostream>
#include <string>

int main()
{
    std::string text("_hello___world__");

    text.erase(
        std::unique(
            text.begin(),
            text.end(),
            [](char a, char b){ return (a == b) && (a == '_'); }
        ),
        text.end()
    );

    std::cout << text << '\n';

    return 0;
}

If you don't want to use a lambda you could define a functor like:

class both_equal_to
{
    char value;
public:
    both_equal_to(char ch) : value(ch) {}

    bool operator()(char first, char second) const
    {
        return (first == second) && (first == value);
    }
};

and then replace the lambda with both_equal_to('_').

If you are only using char* and don't want to pay the cost of constructing a std::string the following code is closer to RolandXu's code.

char *convert_underscores(char *str)
{
    if (str)
    {
        size_t length = strlen(str);
        char *end = std::unique(str, str + length, both_equal_to('_'));
        *end = '\0';
    }
    return str;
}
于 2012-06-01T01:42:49.013 回答
3

without the libraries:

#include <stdio.h>
char *RemoveUnderScore(char *src)
{
    char* readIndex;
    char* writeIndex;
    if (src==NULL) return NULL;
    readIndex = writeIndex = src;

    while (*readIndex != '\0')
    {
        while(*readIndex !='_' && *readIndex != '\0')
            *writeIndex++ = *readIndex++;

        if (*readIndex != '\0')
            *writeIndex++ = *readIndex++;

        while (*readIndex == '_')
            readIndex++;
    }

    *writeIndex = '\0';
    return src;
}

int main(int argc,char** argv){
    char str[] = "_hello___worl__d___";
    printf(RemoveUnderScore(str));
    return 0;
}
于 2012-06-01T01:47:15.157 回答
3

This post compares the speed of the methods submitted on this page. I ran the function a million times on a 40 char string and timed how long each algorithm took.

Time Required | Algorithm Used

0.2 seconds | RolandXu's version using char*, juggling pointers to chars.

0.4 seconds | Blastfurnace's version part two, functor, without string.

2.7 seconds | Blastfurnace's version part one with the functor and string.

8.7 seconds | Eric L's version looping over the string doing find __ replace with _.

11.0 seconds | Eric L's version looping over each char and assembling a string.

11.8 seconds | Jerry Coffin's version with remove_copy_if.

C++ code to Prove the above benchmarks and timings:

#include <iostream>
#include <cstdio>
#include <ctime>
#include <cstring>
#include <algorithm>

using namespace std;
string convert_underscores_by_EricL_using_string_replace(string str){

    //Cons:
        //This is the very slowest algorithm because the complexity of this operation
        //is O(n^2) and possibly higher with the overhead of string conversions.
    //Pros:
        //This is the function is the most concise, needing only 3 lines of code.

    while(str.find("__") != string::npos){
        str = str.replace(str.find("__"), 2, "_");
    }
    return str;
}


string convert_underscores_EricL_loop_over_a_string_and_remove_repeats(string str){

    //CONS:
        //Using a string really slows things down. Algorithm is too slow.
        //Not the most concise solution, 8 lines.
        //Has lots of ugly conditionals, x, and x-1, confusing to look at.
    //PROS:
        //Fastest function of those tested.  

    int len = str.length();
    string result = "";
    if (len < 2) return str;
    result += str[0];
    for(int x = 1; x < len; x++){
        if (str[x] != str[x-1] || str[x] != '_')
            result += str[x];
    }
    return result;
}




class repeated_by_jerry_coffin {
char prev;
char val;
public:
    repeated_by_jerry_coffin(char ch) : val(ch), prev(0) {}
    bool operator()(char ch) {
        bool ret = prev == val && ch == val;
        prev = ch;
        return ret;
    }
};
string convert_underscores_jerry_coffins_with_remove_copy_if(string str){

    //CONS:
        //Algorithm is the 2nd slowest algorithm.
    //PROS:
        //Concise, intuitive, needing only 4 lines.
        //Offloads the heavy lifting to std builtin methods: std::remove_copy_if and std::back_inserter

    string result = "";
    std::remove_copy_if(str.begin(), str.end(),
        std::back_inserter(result),
        repeated_by_jerry_coffin('_'));

    return result;
}


char* convert_underscores_by_RolandXu_using_char_stars_and_pointers(char *src){

    //CONS:
        //You have to get your hands dirty with C++ pointers.
    //PROS:
        //Fastest function of those tested.  

    char* readIndex;
    char* writeIndex;
    if (src==NULL) return NULL;
    readIndex = writeIndex = src;

    while (*readIndex != '\0')
    {
        while(*readIndex !='_' && *readIndex != '\0')
            *writeIndex++ = *readIndex++;

        if (*readIndex != '\0')
            *writeIndex++ = *readIndex++;

        while (*readIndex == '_')
            readIndex++;
    }
    *writeIndex = '\0';
    return src;
}


class both_equal_to__blastfurnace_version1{
char value;
public:
    both_equal_to__blastfurnace_version1(char ch) : value(ch) {}
    bool operator()(char first, char second) const{
        return (first == second) && (first == value);
    }
};
string convert_underscores_blastfurnace_version1_with_functor(string str){

    //CONS:
        //You have to create an entirely new class that overloads an operator.  
        //The speed is harmed by the usage of string.
    //PROS:
        //Don't need to roll your own loops with pointers.

    str.erase(
        std::unique(
            str.begin(),
            str.end(),
            both_equal_to__blastfurnace_version1('_')
        ),
        str.end()
    );

    return str;
}




class both_equal_to_blastfurnace_version2{
char value;
public:
    both_equal_to_blastfurnace_version2(char ch) : value(ch) {}
    bool operator()(char first, char second) const{
        return (first == second) && (first == value);
    }
};
char *convert_underscores_blastfurnace_version2_std_unique_and_no_string(char *str){
    //CONS:
        //No problems.
    //PROS:
        //More concise/intuitive than the fastest version and nearly as fast.  Winner!

    if (str){
        size_t length = strlen(str);
        char *end = std::unique(str, str + length, both_equal_to_blastfurnace_version2('_'));
        *end = '\0';
    }
    return str;
}

void assertCharStarEquals(char* a, char* b, string msg){
    if (strcmp(a, b) == 0) cout<<"passed" << endl;
    else cout << "Failed" << msg << " should be: '" << a << "'  it returned: '" << b << "'" << endl;
}
void assertStringEquals(string a, string b, string msg){
    if (a == b) cout<<"passed" << endl;
    else cout << "Failed" << msg << " should be: '" << a << "'  it returned: '" << b << "'" << endl;
}

void test01_convert_underscores_by_RolandXu_using_char_stars_and_pointers(int numtests, string str){
    char mystr[str.length()];
    strcpy(mystr, str.c_str());

    clock_t start = clock();
    int x = 0;
    while(x < numtests){
        char* s = convert_underscores_by_RolandXu_using_char_stars_and_pointers(mystr);
        x++;
    }
    double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
    cout <<  diff << "   RolandXu's version using char*. " << '\n';

}

void test02_convert_underscores_blastfurnace_version2_std_unique_and_no_string(int numtests, string str){

    char mystr[str.length()];
    strcpy(mystr, str.c_str());

    clock_t start = clock();
    int x = 0;
    while(x < numtests){
        char* val = convert_underscores_blastfurnace_version2_std_unique_and_no_string(mystr);
        x++;
    }
    double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
    cout << diff << "   Blastfurnace's version part two, functor, without string. " <<endl;
}

void test03_convert_underscores_blastfurnace_version1_with_functor(int numtests, string str){

    clock_t start = clock();
    int x = 0;
    while(x < numtests){
        string s = convert_underscores_blastfurnace_version1_with_functor(str);
        x++;
    }
    double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
    cout << diff << "   Blastfurnace's version part one with the functor and string. " <<endl;
}
void test04_convert_underscores_by_EricL_using_string_replace(int numtests, string str){

    char mystr[str.length()];
    strcpy(mystr, str.c_str());

    clock_t start = clock();
    int x = 0;
    while(x < numtests){
        string s = convert_underscores_by_EricL_using_string_replace(mystr);
        x++;
    }
    double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
    cout<<  diff << "   Eric L's version looping over the string doing find double underscore replace with single underscore. " <<endl;
}

void test05_convert_underscores_EricL_loop_over_a_string_and_remove_repeats(int numtests, string str){

    char mystr[str.length()];
    strcpy(mystr, str.c_str());

    clock_t start = clock();
    int x = 0;
    while(x < numtests){
        string s = convert_underscores_EricL_loop_over_a_string_and_remove_repeats(mystr);
        x++;
    }
    double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
    cout <<  diff << "   Eric L's version looping over each char and assembling a string. "<< endl;
}


void test06_convert_underscores_jerry_coffins_with_remove_copy_if(int numtests, string str){
    clock_t start = clock();
    int x = 0;
    while(x < numtests){
        string s = convert_underscores_jerry_coffins_with_remove_copy_if(str);
        x++;
    }
    double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
    cout<<  diff << "   Jerry Coffin's version with remove_copy_if. " <<endl;
}

int main(){
    cout << "GO!\n";

    int numtests = 1000000;
    string test_string = "__aa_a_aaa_--__&___aa_234______3)_!___<_";
    cout << "Time | Algorithm Used.\n";

    test01_convert_underscores_by_RolandXu_using_char_stars_and_pointers(numtests, test_string);

    test02_convert_underscores_blastfurnace_version2_std_unique_and_no_string(numtests, test_string);

    test03_convert_underscores_blastfurnace_version1_with_functor(numtests, test_string);

    test04_convert_underscores_by_EricL_using_string_replace(numtests, test_string);

    test05_convert_underscores_EricL_loop_over_a_string_and_remove_repeats(numtests, test_string);

    test06_convert_underscores_jerry_coffins_with_remove_copy_if(numtests, test_string);


    //Produces the following output:



    //Extra assertion testing to make sure everyone's algorithm is correct:
    char in[30];
    char out[30];
    strcpy(in, "a__");
    strcpy(out, "a_");
    assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "01");
    strcpy(in, "a_");
    strcpy(out, "a_");
    assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "02");
    strcpy(in, "_______");
    strcpy(out, "_");
    assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "03");
    strcpy(in, "__a");
    strcpy(out, "_a");
    assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "04");
    strcpy(in, "_hello___world__");
    strcpy(out, "_hello_world_");
    assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "05");
    strcpy(in, "");
    strcpy(out, "");
    assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "06");
    strcpy(in, "  __  ");
    strcpy(out, "  _  ");
    assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "07");
    strcpy(in, "U+221E");
    strcpy(out, "U+221E");
    assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "08");
    strcpy(in, "__\u00b2__");
    strcpy(out, "_\u00b2_");
    assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "09");


    cout<< "OK\n";


    strcpy(in, "a__");
    strcpy(out, "a_");
    assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "01");
    strcpy(in, "a_");
    strcpy(out, "a_");
    assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "02");
    strcpy(in, "_______");
    strcpy(out, "_");
    assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "03");
    strcpy(in, "__a");
    strcpy(out, "_a");
    assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "04");
    strcpy(in, "_hello___world__");
    strcpy(out, "_hello_world_");
    assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "05");
    strcpy(in, "");
    strcpy(out, "");
    assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "06");
    strcpy(in, "  __  ");
    strcpy(out, "  _  ");
    assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "07");
    strcpy(in, "U+221E");
    strcpy(out, "U+221E");
    assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "08");
    strcpy(in, "__\u00b2__");
    strcpy(out, "_\u00b2_");
    assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "09");



    cout<< "OK\n";


    string in_s = "a__";
    string out_s = "a_";
    assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "01");
    in_s = "a_";
    out_s = "a_";
    assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "02");
    in_s = "_______";
    out_s = "_";
    assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "03");
    in_s = "__a";
    out_s = "_a";
    assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "04");
    in_s = "_hello___world__";
    out_s = "_hello_world_";
    assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "05");
    in_s = "";
    out_s = "";
    assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "06");
    in_s = "  __  ";
    out_s = "  _  ";
    assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "07");
    in_s = "U+221E";
    out_s = "U+221E";
    assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "08");
    in_s = "__\u00b2__";
    out_s = "_\u00b2_";
    assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "09");



    cout<< "OK\n";


    in_s = "a__";
    out_s = "a_";
    assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "01");
    in_s = "a_";
    out_s = "a_";
    assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "02");
    in_s = "_______";
    out_s = "_";
    assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "03");
    in_s = "__a";
    out_s = "_a";
    assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "04");
    in_s = "_hello___world__";
    out_s = "_hello_world_";
    assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "05");
    in_s = "";
    out_s = "";
    assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "06");
    in_s = "  __  ";
    out_s = "  _  ";
    assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "07");
    in_s = "U+221E";
    out_s = "U+221E";
    assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "08");
    in_s = "__\u00b2__";
    out_s = "_\u00b2_";
    assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "09");



    cout<< "OK\n";


    in_s = "a__";
    out_s = "a_";
    assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "01");
    in_s = "a_";
    out_s = "a_";
    assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "02");
    in_s = "_______";
    out_s = "_";
    assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "03");
    in_s = "__a";
    out_s = "_a";
    assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "04");
    in_s = "_hello___world__";
    out_s = "_hello_world_";
    assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "05");
    in_s = "";
    out_s = "";
    assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "06");
    in_s = "  __  ";
    out_s = "  _  ";
    assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "07");
    in_s = "U+221E";
    out_s = "U+221E";
    assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "08");
    in_s = "__\u00b2__";
    out_s = "_\u00b2_";
    assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "09");




    cout<< "OK\n";


    in_s = "a__";
    out_s = "a_";
    assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "01");
    in_s = "a_";
    out_s = "a_";
    assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "02");
    in_s = "_______";
    out_s = "_";
    assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "03");
    in_s = "__a";
    out_s = "_a";
    assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "04");
    in_s = "_hello___world__";
    out_s = "_hello_world_";
    assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "05");
    in_s = "";
    out_s = "";
    assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "06");
    in_s = "  __  ";
    out_s = "  _  ";
    assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "07");
    in_s = "U+221E";
    out_s = "U+221E";
    assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "08");
    in_s = "__\u00b2__";
    out_s = "_\u00b2_";
    assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "09");





    return 0;
}

What did we learn?

  1. With c++ string, 'str.length()' is really heavy, as the compiler dutifully steps one-by-one though the memory of the string, locating the end of the string in memory, counting as it goes. Don't use string, and don't use str.length();

  2. Using str.end() invokes an O(n) performance hit as well, same as item 1. Don't use str.end();

  3. Using std::unique on a char array is optimized and lightning fast. An order of magnitude faster than a for loop with string concatenation.

  4. In C++, doing a mystring[x] results in a memory lookup of that slot in memory, which takes a long time and is much slower than using a pointer and adding 1 to the pointer. Do not put mystring[x] inside a loop that iterates on x.

  5. If you must use a string, do not put mystring += anotherstring[x]; string has to step one-by-one through the entire string each time you run this line.

  6. DO NOT concatenate strings, grab a chunk of memory, define a pointer, and place characters in the pointer, and increment the pointer. Loops and concatenation with strings invokes O(n) complexity.

Much learning has occurred on this day, great songs will be sung.

于 2012-06-01T03:58:11.693 回答
2

In C++, what I gave, very inefficiently:

string convert_underscores(string str){
    int len = str.length();   
    //bad, str.length() incurs O(n) complexity.

    string result = "";
    //bad, don't use string.  string makes everything slow.

    if (len < 2) return str;  
    result += str[0];
    //bad bad bad, string concatenation incurs O(n) complexity.

    for(int x = 1; x < len; x++){  
        //This for loop incurs unnecessary management overhead.

        if (str[x] != str[x-1] || str[x] != '_'){
            result += str[x];  
            //concatenation extremely evil here: costs O(n)
            //A lookup on str[x] costs time.  
        }
    }

    return result;  
    //returning a string bad, the entire memory is moved 
    //instead of just a pointer to the first slot.
}

or even less efficient with str.replace.

string convert_underscores(string str){
    while(str.find("__") != string::npos){ 
        str = str.replace(str.find("__"), 2, "_");
    }
    return str;
}
//find incurs unnecessary O(n) complexity. string copy incurs 
//a full memory movement on contents of string.  
//n passes are required over the string.  
于 2012-06-01T01:19:10.880 回答
2

I think I'd use std::remove_copy_if, something like this:

char prev;

std::remove_copy_if(input.begin(), input.end(), 
    std::back_inserter(result), 
    [&prev] (char ch) ->bool { 
        bool ret=prev=='_' && ch == '_'; 
        prev=ch; 
        return ret;
});

Or, if you're stuck with C++03, you can do the same with an explicit functor instead of a lambda:

class repeated { 
    char prev;
    char val;
public:
    repeated(char ch) : val(ch), prev(0) {}

    bool operator()(char ch) { 
        bool ret = prev == val && ch == val; 
        prev = ch;
        return ret;
    }
};

std::remove_copy_if(input.begin(), input.end(), 
                    std::back_inserter(result), 
                    repeated('_'));

Either way, it only copies each character from input to output (at most) once, where using std::string::replace copies each character as often as the number of repeated underscores to its left.

Edit: though looking at @blastfurnace's answer, I'm not sure I really would use this -- std::unique probably is better suited to this job.

于 2012-06-01T02:18:42.517 回答