只是为了好玩,这里是一个基于 Rcpp 的实现:
#include <Rcpp.h>
using namespace Rcpp ;
// Replaces, in every element of `text`, all occurrences of old[j] with
// new_[j], applying the (old, new_) pairs in index order.
//
// Each pair is applied in a single left-to-right pass: after a substitution
// the search resumes just past the inserted text. This guarantees termination
// when a replacement contains its own pattern (e.g. "&" -> "&amp;"), which
// would otherwise be matched again forever. Empty patterns are skipped because
// an empty string matches at every position.
//
// Calls stop() if `new_` has fewer elements than `old`; extra elements of
// `new_` are ignored. Returns a character vector of the same length as `text`.
// [[Rcpp::export]]
CharacterVector rcpp_conv(
    CharacterVector text, CharacterVector old , CharacterVector new_){
    int n = text.size() ;
    int nr = old.size() ;
    if( new_.size() < nr ){
        stop( "'new_' must supply a replacement for every element of 'old'" ) ;
    }

    // Declared once and reused across iterations so their capacity is kept.
    std::string buffer, current_old, current_new ;
    CharacterVector res(n) ;

    for( int i=0; i<n; i++){
        buffer = text[i] ;
        for( int j=0; j<nr; j++){
            current_old = old[j] ;
            if( current_old.empty() ) continue ;   // would match everywhere
            current_new = new_[j] ;

            size_t pos = buffer.find( current_old ) ;
            while( pos != std::string::npos ){
                buffer.replace( pos, current_old.size(), current_new ) ;
                // Skip over the text just inserted so it is never re-matched.
                pos = buffer.find( current_old, pos + current_new.size() ) ;
            }
        }
        res[i] = buffer ;
    }
    return res ;
}
使用该实现,我获得了相当大的性能提升:
> microbenchmark(
+ html.fastconv( sometext,oldchar,newchar),
+ html.fastconvJC(sometext, oldchar, newchar),
+ rcpp_conv( sometext, oldchar, newchar)
+ )
Unit: microseconds
expr min lq median uq
1 html.fastconv(sometext, oldchar, newchar) 97.588 99.9845 101.4195 103.072
2 html.fastconvJC(sometext, oldchar, newchar) 19.945 23.3060 25.8110 28.134
3 rcpp_conv(sometext, oldchar, newchar) 4.047 5.1555 6.2340 9.275
max
1 256.061
2 40.647
3 25.763
这是一个基于 Rcpp::String 功能的实现,该功能自 Rcpp >= 0.10.2 起可用:
// Function object that applies a fixed list of substitutions to one string:
// for each index i, every occurrence of old[i] is replaced by new_[i] via
// String::replace_all, in index order. Used with sapply() over a character
// vector.
class StringConv{
public:
    // Type returned by operator(); presumably inspected by sapply() to pick
    // the output vector type -- confirm against the Rcpp sugar documentation.
    typedef String result_type ;

    // old_         : patterns to search for.
    // replacement_ : replacement for each pattern, paired with old_ by index.
    // Calls stop() if there are fewer replacements than patterns, since
    // operator() indexes both vectors up to old_.size().
    StringConv( CharacterVector old_, CharacterVector replacement_):
        nr(old_.size()), old(old_), new_(replacement_){
        if( new_.size() < nr ){
            stop( "a replacement is required for every pattern" ) ;
        }
    }

    // Returns `text` after all substitutions have been applied. `text` is
    // taken by value, so the caller's string is not modified.
    String operator()(String text) const {
        for( int i=0; i<nr; i++){
            text.replace_all( old[i], new_[i] ) ;
        }
        return text ;
    }

private:
    int nr ;                 // number of (pattern, replacement) pairs
    CharacterVector old ;    // patterns
    CharacterVector new_ ;   // replacements, same indexing as `old`
} ;
// Applies the substitutions old[i] -> new_[i] to every element of `text` by
// mapping a StringConv functor over it with sapply(), and returns the
// converted strings as a new character vector.
// [[Rcpp::export]]
CharacterVector test_sapply_string(
    CharacterVector text, CharacterVector old , CharacterVector new_
){
    // Build the converter once; sapply() invokes it for each element.
    StringConv converter( old, new_ ) ;
    CharacterVector converted = sapply( text, converter ) ;
    return converted ;
}