5 回答
Using erase/unique
and a C++11 lambda.
#include <algorithm>
#include <iostream>
#include <string>
int main()
{
std::string text("_hello___world__");
text.erase(
std::unique(
text.begin(),
text.end(),
[](char a, char b){ return (a == b) && (a == '_'); }
),
text.end()
);
std::cout << text << '\n';
return 0;
}
If you don't want to use a lambda you could define a functor like:
class both_equal_to
{
char value;
public:
both_equal_to(char ch) : value(ch) {}
bool operator()(char first, char second) const
{
return (first == second) && (first == value);
}
};
and then replace the lambda with both_equal_to('_')
.
If you are only using char*
and don't want to pay the cost of constructing a std::string
the following code is closer to RolandXu's code.
char *convert_underscores(char *str)
{
if (str)
{
size_t length = strlen(str);
char *end = std::unique(str, str + length, both_equal_to('_'));
*end = '\0';
}
return str;
}
without the libraries:
#include <stdio.h>
char *RemoveUnderScore(char *src)
{
char* readIndex;
char* writeIndex;
if (src==NULL) return NULL;
readIndex = writeIndex = src;
while (*readIndex != '\0')
{
while(*readIndex !='_' && *readIndex != '\0')
*writeIndex++ = *readIndex++;
if (*readIndex != '\0')
*writeIndex++ = *readIndex++;
while (*readIndex == '_')
readIndex++;
}
*writeIndex = '\0';
return src;
}
int main(int argc,char** argv){
char str[] = "_hello___worl__d___";
printf(RemoveUnderScore(str));
return 0;
}
This post compares the speed of the methods submitted on this page. I ran the function a million times on a 40 char string and timed how long each algorithm took.
Time Required | Algorithm Used
0.2 seconds | RolandXu's version using char*, juggling pointers to chars.
0.4 seconds | Blastfurnace's version part two, functor, without string.
2.7 seconds | Blastfurnace's version part one with the functor and string.
8.7 seconds | Eric L's version looping over the string doing find __ replace with _.
11.0 seconds | Eric L's version looping over each char and assembling a string.
11.8 seconds | Jerry Coffin's version with remove_copy_if.
C++ code to Prove the above benchmarks and timings:
#include <iostream>
#include <cstdio>
#include <ctime>
#include <cstring>
#include <algorithm>
using namespace std;
string convert_underscores_by_EricL_using_string_replace(string str){
//Cons:
//This is the very slowest algorithm because the complexity of this operation
//is O(n^2) and possibly higher with the overhead of string conversions.
//Pros:
//This is the function is the most concise, needing only 3 lines of code.
while(str.find("__") != string::npos){
str = str.replace(str.find("__"), 2, "_");
}
return str;
}
string convert_underscores_EricL_loop_over_a_string_and_remove_repeats(string str){
//CONS:
//Using a string really slows things down. Algorithm is too slow.
//Not the most concise solution, 8 lines.
//Has lots of ugly conditionals, x, and x-1, confusing to look at.
//PROS:
//Fastest function of those tested.
int len = str.length();
string result = "";
if (len < 2) return str;
result += str[0];
for(int x = 1; x < len; x++){
if (str[x] != str[x-1] || str[x] != '_')
result += str[x];
}
return result;
}
class repeated_by_jerry_coffin {
char prev;
char val;
public:
repeated_by_jerry_coffin(char ch) : val(ch), prev(0) {}
bool operator()(char ch) {
bool ret = prev == val && ch == val;
prev = ch;
return ret;
}
};
string convert_underscores_jerry_coffins_with_remove_copy_if(string str){
//CONS:
//Algorithm is the 2nd slowest algorithm.
//PROS:
//Concise, intuitive, needing only 4 lines.
//Offloads the heavy lifting to std builtin methods: std::remove_copy_if and std::back_inserter
string result = "";
std::remove_copy_if(str.begin(), str.end(),
std::back_inserter(result),
repeated_by_jerry_coffin('_'));
return result;
}
char* convert_underscores_by_RolandXu_using_char_stars_and_pointers(char *src){
//CONS:
//You have to get your hands dirty with C++ pointers.
//PROS:
//Fastest function of those tested.
char* readIndex;
char* writeIndex;
if (src==NULL) return NULL;
readIndex = writeIndex = src;
while (*readIndex != '\0')
{
while(*readIndex !='_' && *readIndex != '\0')
*writeIndex++ = *readIndex++;
if (*readIndex != '\0')
*writeIndex++ = *readIndex++;
while (*readIndex == '_')
readIndex++;
}
*writeIndex = '\0';
return src;
}
class both_equal_to__blastfurnace_version1{
char value;
public:
both_equal_to__blastfurnace_version1(char ch) : value(ch) {}
bool operator()(char first, char second) const{
return (first == second) && (first == value);
}
};
string convert_underscores_blastfurnace_version1_with_functor(string str){
//CONS:
//You have to create an entirely new class that overloads an operator.
//The speed is harmed by the usage of string.
//PROS:
//Don't need to roll your own loops with pointers.
str.erase(
std::unique(
str.begin(),
str.end(),
both_equal_to__blastfurnace_version1('_')
),
str.end()
);
return str;
}
class both_equal_to_blastfurnace_version2{
char value;
public:
both_equal_to_blastfurnace_version2(char ch) : value(ch) {}
bool operator()(char first, char second) const{
return (first == second) && (first == value);
}
};
char *convert_underscores_blastfurnace_version2_std_unique_and_no_string(char *str){
//CONS:
//No problems.
//PROS:
//More concise/intuitive than the fastest version and nearly as fast. Winner!
if (str){
size_t length = strlen(str);
char *end = std::unique(str, str + length, both_equal_to_blastfurnace_version2('_'));
*end = '\0';
}
return str;
}
void assertCharStarEquals(char* a, char* b, string msg){
if (strcmp(a, b) == 0) cout<<"passed" << endl;
else cout << "Failed" << msg << " should be: '" << a << "' it returned: '" << b << "'" << endl;
}
void assertStringEquals(string a, string b, string msg){
if (a == b) cout<<"passed" << endl;
else cout << "Failed" << msg << " should be: '" << a << "' it returned: '" << b << "'" << endl;
}
void test01_convert_underscores_by_RolandXu_using_char_stars_and_pointers(int numtests, string str){
char mystr[str.length()];
strcpy(mystr, str.c_str());
clock_t start = clock();
int x = 0;
while(x < numtests){
char* s = convert_underscores_by_RolandXu_using_char_stars_and_pointers(mystr);
x++;
}
double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
cout << diff << " RolandXu's version using char*. " << '\n';
}
void test02_convert_underscores_blastfurnace_version2_std_unique_and_no_string(int numtests, string str){
char mystr[str.length()];
strcpy(mystr, str.c_str());
clock_t start = clock();
int x = 0;
while(x < numtests){
char* val = convert_underscores_blastfurnace_version2_std_unique_and_no_string(mystr);
x++;
}
double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
cout << diff << " Blastfurnace's version part two, functor, without string. " <<endl;
}
void test03_convert_underscores_blastfurnace_version1_with_functor(int numtests, string str){
clock_t start = clock();
int x = 0;
while(x < numtests){
string s = convert_underscores_blastfurnace_version1_with_functor(str);
x++;
}
double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
cout << diff << " Blastfurnace's version part one with the functor and string. " <<endl;
}
void test04_convert_underscores_by_EricL_using_string_replace(int numtests, string str){
char mystr[str.length()];
strcpy(mystr, str.c_str());
clock_t start = clock();
int x = 0;
while(x < numtests){
string s = convert_underscores_by_EricL_using_string_replace(mystr);
x++;
}
double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
cout<< diff << " Eric L's version looping over the string doing find double underscore replace with single underscore. " <<endl;
}
void test05_convert_underscores_EricL_loop_over_a_string_and_remove_repeats(int numtests, string str){
char mystr[str.length()];
strcpy(mystr, str.c_str());
clock_t start = clock();
int x = 0;
while(x < numtests){
string s = convert_underscores_EricL_loop_over_a_string_and_remove_repeats(mystr);
x++;
}
double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
cout << diff << " Eric L's version looping over each char and assembling a string. "<< endl;
}
void test06_convert_underscores_jerry_coffins_with_remove_copy_if(int numtests, string str){
clock_t start = clock();
int x = 0;
while(x < numtests){
string s = convert_underscores_jerry_coffins_with_remove_copy_if(str);
x++;
}
double diff = (std::clock() - start) / (double)CLOCKS_PER_SEC;
cout<< diff << " Jerry Coffin's version with remove_copy_if. " <<endl;
}
int main(){
cout << "GO!\n";
int numtests = 1000000;
string test_string = "__aa_a_aaa_--__&___aa_234______3)_!___<_";
cout << "Time | Algorithm Used.\n";
test01_convert_underscores_by_RolandXu_using_char_stars_and_pointers(numtests, test_string);
test02_convert_underscores_blastfurnace_version2_std_unique_and_no_string(numtests, test_string);
test03_convert_underscores_blastfurnace_version1_with_functor(numtests, test_string);
test04_convert_underscores_by_EricL_using_string_replace(numtests, test_string);
test05_convert_underscores_EricL_loop_over_a_string_and_remove_repeats(numtests, test_string);
test06_convert_underscores_jerry_coffins_with_remove_copy_if(numtests, test_string);
//Produces the following output:
//Extra assertion testing to make sure everyone's algorithm is correct:
char in[30];
char out[30];
strcpy(in, "a__");
strcpy(out, "a_");
assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "01");
strcpy(in, "a_");
strcpy(out, "a_");
assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "02");
strcpy(in, "_______");
strcpy(out, "_");
assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "03");
strcpy(in, "__a");
strcpy(out, "_a");
assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "04");
strcpy(in, "_hello___world__");
strcpy(out, "_hello_world_");
assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "05");
strcpy(in, "");
strcpy(out, "");
assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "06");
strcpy(in, " __ ");
strcpy(out, " _ ");
assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "07");
strcpy(in, "U+221E");
strcpy(out, "U+221E");
assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "08");
strcpy(in, "__\u00b2__");
strcpy(out, "_\u00b2_");
assertCharStarEquals(out, convert_underscores_by_RolandXu_using_char_stars_and_pointers(in), "09");
cout<< "OK\n";
strcpy(in, "a__");
strcpy(out, "a_");
assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "01");
strcpy(in, "a_");
strcpy(out, "a_");
assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "02");
strcpy(in, "_______");
strcpy(out, "_");
assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "03");
strcpy(in, "__a");
strcpy(out, "_a");
assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "04");
strcpy(in, "_hello___world__");
strcpy(out, "_hello_world_");
assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "05");
strcpy(in, "");
strcpy(out, "");
assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "06");
strcpy(in, " __ ");
strcpy(out, " _ ");
assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "07");
strcpy(in, "U+221E");
strcpy(out, "U+221E");
assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "08");
strcpy(in, "__\u00b2__");
strcpy(out, "_\u00b2_");
assertCharStarEquals(out, convert_underscores_blastfurnace_version2_std_unique_and_no_string(in), "09");
cout<< "OK\n";
string in_s = "a__";
string out_s = "a_";
assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "01");
in_s = "a_";
out_s = "a_";
assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "02");
in_s = "_______";
out_s = "_";
assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "03");
in_s = "__a";
out_s = "_a";
assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "04");
in_s = "_hello___world__";
out_s = "_hello_world_";
assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "05");
in_s = "";
out_s = "";
assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "06");
in_s = " __ ";
out_s = " _ ";
assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "07");
in_s = "U+221E";
out_s = "U+221E";
assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "08");
in_s = "__\u00b2__";
out_s = "_\u00b2_";
assertStringEquals(out_s, convert_underscores_blastfurnace_version1_with_functor(in_s), "09");
cout<< "OK\n";
in_s = "a__";
out_s = "a_";
assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "01");
in_s = "a_";
out_s = "a_";
assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "02");
in_s = "_______";
out_s = "_";
assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "03");
in_s = "__a";
out_s = "_a";
assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "04");
in_s = "_hello___world__";
out_s = "_hello_world_";
assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "05");
in_s = "";
out_s = "";
assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "06");
in_s = " __ ";
out_s = " _ ";
assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "07");
in_s = "U+221E";
out_s = "U+221E";
assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "08");
in_s = "__\u00b2__";
out_s = "_\u00b2_";
assertStringEquals(out_s, convert_underscores_by_EricL_using_string_replace(in_s), "09");
cout<< "OK\n";
in_s = "a__";
out_s = "a_";
assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "01");
in_s = "a_";
out_s = "a_";
assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "02");
in_s = "_______";
out_s = "_";
assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "03");
in_s = "__a";
out_s = "_a";
assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "04");
in_s = "_hello___world__";
out_s = "_hello_world_";
assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "05");
in_s = "";
out_s = "";
assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "06");
in_s = " __ ";
out_s = " _ ";
assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "07");
in_s = "U+221E";
out_s = "U+221E";
assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "08");
in_s = "__\u00b2__";
out_s = "_\u00b2_";
assertStringEquals(out_s, convert_underscores_EricL_loop_over_a_string_and_remove_repeats(in_s), "09");
cout<< "OK\n";
in_s = "a__";
out_s = "a_";
assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "01");
in_s = "a_";
out_s = "a_";
assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "02");
in_s = "_______";
out_s = "_";
assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "03");
in_s = "__a";
out_s = "_a";
assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "04");
in_s = "_hello___world__";
out_s = "_hello_world_";
assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "05");
in_s = "";
out_s = "";
assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "06");
in_s = " __ ";
out_s = " _ ";
assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "07");
in_s = "U+221E";
out_s = "U+221E";
assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "08");
in_s = "__\u00b2__";
out_s = "_\u00b2_";
assertStringEquals(out_s, convert_underscores_jerry_coffins_with_remove_copy_if(in_s), "09");
return 0;
}
What did we learn?
With c++ string, 'str.length()' is really heavy, as the compiler dutifully steps one-by-one though the memory of the string, locating the end of the string in memory, counting as it goes. Don't use string, and don't use str.length();
Using str.end() invokes an O(n) performance hit as well, same as item 1. Don't use str.end();
Using std::unique on a char array is optimized and lightning fast. An order of magnitude faster than a for loop with string concatenation.
In C++, doing a mystring[x] results in a memory lookup of that slot in memory, which takes a long time and is much slower than using a pointer and adding 1 to the pointer. Do not put mystring[x] inside a loop that iterates on x.
If you must use a string, do not put mystring += anotherstring[x]; string has to step one-by-one through the entire string each time you run this line.
DO NOT concatenate strings, grab a chunk of memory, define a pointer, and place characters in the pointer, and increment the pointer. Loops and concatenation with strings invokes O(n) complexity.
Much learning has occurred on this day, great songs will be sung.
In C++, what I gave, very inefficiently:
string convert_underscores(string str){
int len = str.length();
//bad, str.length() incurs O(n) complexity.
string result = "";
//bad, don't use string. string makes everything slow.
if (len < 2) return str;
result += str[0];
//bad bad bad, string concatenation incurs O(n) complexity.
for(int x = 1; x < len; x++){
//This for loop incurs unnecessary management overhead.
if (str[x] != str[x-1] || str[x] != '_'){
result += str[x];
//concatenation extremely evil here: costs O(n)
//A lookup on str[x] costs time.
}
}
return result;
//returning a string bad, the entire memory is moved
//instead of just a pointer to the first slot.
}
or even less efficient with str.replace.
string convert_underscores(string str){
while(str.find("__") != string::npos){
str = str.replace(str.find("__"), 2, "_");
}
return str;
}
//find incurs unnecessary O(n) complexity. string copy incurs
//a full memory movement on contents of string.
//n passes are required over the string.
I think I'd use std::remove_copy_if
, something like this:
char prev;
std::remove_copy_if(input.begin(), input.end(),
std::back_inserter(result),
[&prev] (char ch) ->bool {
bool ret=prev=='_' && ch == '_';
prev=ch;
return ret;
});
Or, if you're stuck with C++03, you can do the same with an explicit functor instead of a lambda:
class repeated {
char prev;
char val;
public:
repeated(char ch) : val(ch), prev(0) {}
bool operator()(char ch) {
bool ret = prev == val && ch == val;
prev = ch;
return ret;
}
};
std::remove_copy_if(input.begin(), input.end(),
std::back_inserter(result),
repeated('_'));
Either way, it only copies each character from input to output (at most) once, where using std::string::replace
copies each character as often as the number of repeated underscores to its left.
Edit: though looking at @blastfurnace's answer, I'm not sure I really would use this -- std::unique
probably is better suited to this job.