0

我有一些遵循 svmlight/liblinear/libsvm 格式的较大文本文件:

35 1:23 2:35 3:1.2 4:353.12
12 1:21 2:31 3:1.1 4:323.12
5 1:22 2:37 3:1.5 4:343.12

实际数据文件中的一些行是:

2007  1:45.442001 2:-30.749760 3:31.785870 4:4.635690 5:-15.148940 6:0.233700 7:-11.979680 8:-9.597080 9:6.481110 10:-8.890730 11:4.024050 12:-2.288730 13:17.902040 14:1377.122192 15:1762.042603 16:947.344299 17:562.281372 18:524.268127 19:361.254089 20:514.535095 21:247.701263 22:399.788788 23:205.365204 24:211.868698 25:44.889709 26:-299.639221 27:-227.641693 28:7.342100 29:-85.800568 30:16.812111 31:-41.185810 32:-12.008580 33:-41.921169 34:-26.523121 35:-12.197650 36:89.611038 37:-266.078339 38:-230.312225 39:-168.296097 40:38.403709 41:31.840160 42:28.980709 43:-98.734467 44:26.465651 45:-30.232719 46:-1.346370 47:253.369446 48:61.734241 49:16.157650 50:185.248672 51:75.572441 52:55.173672 53:-41.088692 54:15.864380 55:16.036921 56:-142.579681 57:59.747959 58:-151.467606 59:-12.706120 60:-104.992416 61:23.105141 62:47.001789 63:-13.598940 64:-79.190453 65:-27.637449 66:38.946049 67:-55.405849 68:54.735271 69:15.105210 70:-3.795960 71:390.477203 72:17.640181 73:-68.648270 74:-62.436111 75:-31.960951 76:33.907799 77:-181.376083 78:139.840775 79:-129.488403 80:76.238258 81:-8.840460 82:-0.154390 83:137.442093 84:77.547394 85:-4.228750 86:-61.926571 87:-33.527222 88:-3.862530 89:36.424000 90:7.173090
2003  1:52.678139 2:-2.889140 3:43.952679 4:-1.392090 5:-14.933790 6:-15.868770 7:1.193790 8:0.314010 9:-4.442350 10:-5.789340 11:2.296380 12:3.228460 13:5.668400 14:702.253845 15:685.431763 16:390.788666 17:895.268372 18:324.306152 19:256.451416 20:208.506546 21:190.251114 22:152.547104 23:119.232758 24:120.306580 25:7.739700 26:-8.927490 27:41.785198 28:-77.864777 29:-3.884480 30:6.110340 31:-27.502100 32:-0.865540 33:-8.058480 34:-20.055000 35:-2.468020 36:-10.228670 37:-94.059570 38:-76.709084 39:-62.492088 40:-165.220093 41:31.989639 42:11.304370 43:-29.555790 44:15.538950 45:-13.984620 46:-3.623290 47:398.099701 48:135.859528 49:-44.194340 50:72.132782 51:42.844990 52:28.899509 53:14.900610 54:11.398700 55:1.567970 56:-35.880360 57:57.098370 58:-53.742119 59:-79.554382 60:1.636050 61:-3.932900 62:-3.789370 63:-13.990090 64:-29.146971 65:-2.626020 66:55.268539 67:-7.301310 68:54.086380 69:-3.893580 70:0.908760 71:45.458092 72:36.820759 73:-44.692669 74:-20.529881 75:-6.139750 76:2.592730 77:-136.928253 78:22.407591 79:-36.550179 80:-35.025360 81:-5.743560 82:-42.579102 83:-2.911030 84:48.728050 85:-3.081830 86:-9.388880 87:-7.271790 88:-4.009660 89:-68.962112 90:-5.215250
2005  1:45.742352 2:12.022910 3:11.030090 4:-11.607630 5:11.800540 6:-11.123890 7:-5.390580 8:-1.119810 9:-7.740860 10:-3.334210 11:-3.226590 12:1.592990 13:40.885361 14:1972.836426 15:1145.013672 16:1099.560791 17:756.863953 18:590.113098 19:643.208435 20:404.141327 21:297.472534 22:223.873596 23:201.423416 24:226.905090 25:-1.111890 26:415.191345 27:71.191254 28:-55.937180 29:-31.572590 30:-14.187500 31:51.356079 32:15.550360 33:2.802390 34:-32.016411 35:22.382231 36:22.671370 37:49.065201 38:200.912338 39:-311.135925 40:-41.306648 41:-28.763250 42:89.578720 43:-44.595200 44:36.640621 45:-31.243629 46:-13.913190 47:103.467232 48:-20.585791 49:-18.699551 50:88.995888 51:-27.768690 52:54.397881 53:45.422440 54:33.072189 55:10.182240 56:-35.116600 57:249.768066 58:-76.346748 59:41.672920 60:-38.447338 61:-25.854719 62:-22.664749 63:-13.078690 64:18.624029 65:74.230911 66:35.444550 67:12.329590 68:22.056810 69:44.172779 70:46.889111 71:106.983597 72:74.135773 73:22.619181 74:-6.700720 75:-24.821899 76:35.950630 77:14.704640 78:139.090805 79:-80.349121 80:-6.568260 81:-4.706060 82:-24.225990 83:-35.226860 84:27.777290 85:15.389340 86:58.200359 87:-61.126980 88:-10.925220 89:26.753481 90:-5.787430
2003  1:52.558830 2:2.872220 3:27.388479 4:-5.762350 5:-15.357660 6:-15.015920 7:-5.868930 8:-0.314470 9:-5.069220 10:-4.627340 11:1.094960 12:0.146510 13:4.728110 14:525.899292 15:426.023407 16:462.865997 17:545.496765 18:320.350647 19:207.019897 20:241.681976 21:215.257568 22:121.743217 23:113.572952 24:151.218094 25:11.411310 26:-8.163200 27:-21.420420 28:-34.561981 29:9.114970 30:13.452100 31:-6.900850 32:-7.353940 33:-5.067550 34:-5.165550 35:0.510740 36:7.155810 37:-159.725861 38:-25.356190 39:-102.939949 40:36.878929 41:3.154020 42:18.147430 43:-28.873240 44:21.521170 45:1.494080 46:0.855090 47:112.993828 48:69.087212 49:-56.420319 50:116.512131 51:13.048910 52:22.800711 53:9.616600 54:16.319309 55:17.906010 56:0.850250 57:54.610882 58:-23.400669 59:-66.643929 60:-12.870030 61:-6.847970 62:-2.526670 63:-7.898560 64:-6.603840 65:-3.535520 66:90.427971 67:9.425460 68:36.567829 69:25.027519 70:-1.734350 71:37.976860 72:12.334080 73:-4.718090 74:-14.820230 75:-27.884081 76:9.275720 77:-134.637466 78:32.129719 79:-36.809818 80:27.452789 81:-8.352150 82:-16.867910 83:-10.582770 84:40.101730 85:-0.540050 86:-11.547460 87:-45.358601 88:-4.556940 89:-43.173679 90:-3.337250
2005  1:51.348091 2:9.027020 3:25.337570 4:-6.625370 5:0.033670 6:-12.695650 7:-3.134000 8:2.986490 9:-6.717500 10:-1.858040 11:-1.114940 12:-0.607370 13:9.876010 14:1146.054565 15:864.638367 16:989.264832 17:552.562683 18:547.281555 19:348.295197 20:328.138306 21:270.355560 22:192.469040 23:192.741516 24:149.908417 25:-28.374889 26:-12.990660 27:-227.448273 28:-30.600500 29:64.353157 30:70.337830 31:-42.221901 32:11.576720 33:0.563700 34:-2.935510 35:-0.233660 36:30.354120 37:19.311819 38:41.338871 39:-309.293457 40:58.497639 41:17.552410 42:37.494869 43:-58.175140 44:28.816250 45:-0.601190 46:-17.050550 47:144.773346 48:110.054649 49:-196.500870 50:72.839363 51:-65.838982 52:57.181580 53:54.270790 54:16.510670 55:5.466400 56:52.857239 57:40.528839 58:97.970413 59:28.994801 60:71.283318 61:-33.390919 62:10.739650 63:-7.166210 64:-8.740700 65:-92.274887 66:78.802193 67:60.469372 68:39.829880 69:37.804260 70:-1.799860 71:114.848869 72:-51.463718 73:-125.398689 74:19.257271 75:-26.166790 76:-5.827070 77:-121.600388 78:69.711678 79:24.589479 80:80.465950 81:-6.873660 82:-20.033710 83:-66.389397 84:50.565689 85:0.277470 86:67.056572 87:-55.588459 88:-7.508590 89:28.235109 90:-0.720450
2007  1:45.846401 2:2.833760 3:-6.005060 4:-15.161500 5:-10.723850 6:-15.152330 7:5.007240 8:1.690390 9:-0.955270 10:-0.648950 11:-5.076770 12:-2.181690 13:32.260960 14:1255.578735 15:1491.808838 16:929.633484 17:884.464905 18:513.442078 19:533.146179 20:315.062347 21:355.699799 22:187.780396 23:273.448730 24:239.651230 25:42.578140 26:139.817398 27:77.404160 28:-125.109329 29:3.481800 30:8.431550 31:25.527790 32:-11.566490 33:31.547911 34:-33.194939 35:35.622768 36:98.843971 37:-74.664299 38:-55.493752 39:-142.034805 40:136.285583 41:57.116482 42:96.431488 43:-6.046260 44:17.230221 45:-9.377930 46:26.988079 47:-61.483089 48:149.761673 49:-41.559071 50:122.963379 51:-61.148178 52:63.133350 53:-4.466220 54:39.530510 55:-30.881510 56:73.216476 57:64.885696 58:-52.324692 59:157.851456 60:-9.440020 61:-59.519360 62:13.423860 63:18.665951 64:-208.924088 65:96.040138 66:-75.221062 67:112.305283 68:-17.383190 69:32.235378 70:3.296620 71:50.905499 72:-128.179276 73:-15.145500 74:-113.525757 75:-46.479698 76:18.937969 77:-139.734528 78:16.025990 79:-85.868294 80:39.124649 81:-27.972549 82:-76.794800 83:55.541039 84:88.864410 85:-8.432410 86:62.005070 87:123.561462 88:7.871000 89:-38.616798 90:26.411659
2003  1:48.135792 2:1.086430 3:9.589960 4:-12.595940 5:-0.094050 6:-12.147480 7:-13.709280 8:-3.098700 9:-1.393680 10:-4.756290 11:-1.302600 12:1.310910 13:26.323999 14:1461.063721 15:1170.573975 16:1014.678223 17:757.746460 18:515.308411 19:315.104584 20:321.322754 21:334.500549 22:224.677887 23:213.518784 24:215.590195 25:-36.780762 26:230.421402 27:31.557949 28:-134.648193 29:48.904621 30:34.625450 31:4.527690 32:12.819410 33:0.127740 34:-7.029300 35:16.655870 36:-19.999611 37:-152.366150 38:-105.105309 39:-174.247406 40:60.828758 41:49.691872 42:18.926250 43:-57.088161 44:69.231201 45:0.094510 46:24.512751 47:389.871002 48:22.275311 49:-43.229050 50:113.951782 51:-25.452620 52:38.201389 53:43.678810 54:67.439743 55:-59.954189 56:49.155460 57:60.899910 58:-95.543419 59:-44.525139 60:-77.580452 61:-15.260040 62:24.210541 63:-9.920600 64:-128.993469 65:4.179530 66:95.650917 67:-40.966881 68:7.517620 69:26.851370 70:6.279250 71:100.435966 72:74.072609 73:3.479960 74:-40.648312 75:-44.314892 76:10.898230 77:-57.362991 78:99.334396 79:-54.742260 80:104.983833 81:-2.894430 82:-54.763592 83:40.297291 84:31.450239 85:1.103620 86:28.259859 87:23.311010 88:5.403460 89:68.146980 90:-15.644380
2003  1:50.774220 2:10.306970 3:38.833832 4:2.135230 5:-12.295030 6:-16.126940 7:3.010910 8:4.843600 9:-2.116620 10:-1.904550 11:2.062730 12:3.397320 13:14.777590 14:1049.617676 15:1336.198486 16:681.697937 17:879.616577 18:462.965881 19:290.403687 20:282.654999 21:315.051910 22:192.788345 23:177.245438 24:216.466476 25:-20.513330 26:-87.685112 27:-19.301571 28:-46.450470 29:-58.279259 30:13.538130 31:5.162740 32:-0.460890 33:39.515469 34:7.114010 35:15.739370 36:22.136230 37:-81.368942 38:-147.379425 39:-98.616241 40:-29.209419 41:34.321522 42:2.060160 43:-30.081430 44:36.432110 45:-13.177250 46:2.261150 47:340.016541 48:10.768990 49:-57.379421 50:110.271263 51:-19.973940 52:3.625370 53:-10.102060 54:-1.543880 55:-7.883780 56:-3.416110 57:66.224503 58:-8.526570 59:-83.281860 60:-9.265550 61:17.536169 62:14.193860 63:-10.285960 64:-60.309750 65:-131.561417 66:85.569321 67:28.591600 68:37.395119 69:42.838001 70:6.753630 71:31.879320 72:-35.305740 73:-49.290539 74:-56.658569 75:-29.851339 76:5.755470 77:-127.410233 78:6.754480 79:-18.591490 80:48.416950 81:-5.079660 82:-44.774738 83:44.031250 84:10.971820 85:-4.908250 86:-23.094490 87:-21.832060 88:-0.139780 89:-39.147041 90:0.954250
2003  1:49.158970 2:-2.382560 3:51.106758 4:2.096510 5:-19.883600 6:-10.847170 7:1.936660 8:-0.585220 9:-9.527060 10:-5.628470 11:1.117850 12:8.080500 13:52.736198 14:1610.526367 15:1048.697876 16:699.699097 17:1093.320190 18:442.795197 19:413.717560 20:282.639801 21:327.524323 22:212.697281 23:143.314911 24:283.513153 25:-104.071472 26:84.974327 27:49.865181 28:-4.966470 29:-2.476170 30:-9.160600 31:-31.066231 32:-33.176239 33:-3.536770 34:-26.308910 35:27.660151 36:-25.920719 37:-46.777882 38:6.794870 39:-153.272095 40:-215.387848 41:20.085751 42:28.508430 43:-48.295429 44:20.006901 45:-5.075860 46:-22.038639 47:689.643738 48:51.127209 49:-57.364071 50:36.180710 51:-11.456780 52:-11.060990 53:1.324540 54:31.615129 55:-122.590012 56:-27.722010 57:164.565201 58:-8.001100 59:-46.325619 60:7.174110 61:39.002029 62:2.867760 63:-14.931470 64:-222.400345 65:-29.179220 66:91.891869 67:54.306229 68:1.223730 69:83.647133 70:37.999901 71:25.496920 72:48.499828 73:-78.835808 74:-125.095070 75:-30.123671 76:3.157580 77:-166.867477 78:54.335869 79:-52.169189 80:-156.988968 81:0.460720 82:33.267231 83:14.544800 84:26.725420 85:-10.330890 86:-56.613251 87:24.820271 88:18.069550 89:-148.310776 90:13.764190
2005  1:50.630531 2:-10.591620 3:33.569851 4:-3.657070 5:-17.252439 6:-15.497890 7:-1.123680 8:-1.232340 9:-0.589600 10:-5.671160 11:1.830230 12:3.629500 13:8.015560 14:998.391907 15:664.230408 16:489.109589 17:682.265198 18:359.168152 19:233.805527 20:221.424911 21:187.124252 22:135.840759 23:131.866730 24:134.819794 25:-22.119030 26:-4.471730 27:-25.231850 28:-47.849972 29:-27.661140 30:8.277170 31:3.009250 32:-5.689490 33:-11.178010 34:-20.120310 35:13.760620 36:-6.357670 37:-105.914413 38:-176.465897 39:-80.240990 40:-28.631901 41:28.530621 42:11.108940 43:-31.423651 44:36.545422 45:1.247940 46:-4.130270 47:214.719543 48:90.757584 49:-37.422741 50:95.684219 51:42.480240 52:9.748400 53:9.479200 54:39.394749 55:2.614500 56:-11.076560 57:56.877048 58:-77.622551 59:-96.041847 60:-36.586418 61:-12.461760 62:-11.692830 63:-11.398230 64:34.850460 65:-25.190050 66:70.402802 67:-11.947600 68:43.388599 69:19.466080 70:-0.415960 71:70.317902 72:34.508572 73:-12.467560 74:6.159420 75:-20.868429 76:8.175780 77:-126.849983 78:40.370041 79:-3.643520 80:-64.381844 81:-3.317920 82:19.025909 83:-16.542931 84:43.292210 85:-1.584870 86:-21.691681 87:15.877830 88:-3.735770 89:-70.888710 90:-1.293630

我想将文件读入data.frame。

我尝试了以下方法,但速度非常慢。我想知道是否有某种方法可以加快速度?

read.svmlight <- function( filename, K ) {
   f <- file( filename, "r")
   lines <- readLines( f )
   close(f)
   print("read lines, parsing...")

   N <- length(lines)
   results <- as.data.frame(matrix(nrow=N,ncol=K))
   for( n in 1:N ) {
      if( n %% 10 == 0 ) {
          cat("n",n,"\n")
      }
      thisline <- lines[n]
      tokens <- strsplit(thisline," ")[[1]]
      thisy <- as.numeric(tokens[1])
      results[n,1] <- thisy
      numtokens <- length(tokens)
      for( j in 2:numtokens ) {
         thistoken <- tokens[j]
         if( thistoken != '' ) {
             splittoken <- strsplit(thistoken,':')[[1]]
             id <- as.integer( splittoken[1])
             value <- as.numeric(splittoken[2])
             results[n,id+1] <- value
         }
      }
   }
   return(results)
}

此代码段每秒执行大约 1 行。该文件长约 25,000 行。

编辑:解析 10 行的时间似乎随着总行数增加,所以我猜在上面的脚本中插入时发生了一些重新分配?

4

4 回答 4

2

哦,好吧,似乎已经有一个包提供了这个。但是,我的解决方案可能仍会为 OP 提供一些内部信息:

read.svmlight2 <- function( filename ) {
  f <- file( filename, "r")
  lines <- readLines( f )
  close(f)

  temp <- strsplit(lines,'[: ]')
  temp <- do.call('rbind',temp)
  temp2 <- temp[,(1:(ncol(temp)/2))*2]
  temp2[,1] <- temp[,1]
  temp2 <- apply(temp2,2,FUN=as.numeric)
  res <- as.data.frame(temp2)

  res
}
于 2012-12-07T10:03:08.113 回答
1

您可能可以尝试以下方法:

read.table(text=gsub(" [0-9]+:", " ", readLines("test.txt")))

其中“test.txt”是您的数据文件......至少适用于您的示例数据,但我还没有检查它的效率。

于 2012-12-07T09:44:25.610 回答
1

更好的解决方案是使用 SparseMatrix 提供的可爱的构造函数。

require(Matrix)

read.svmlight2 <- function( filename ) {
    f <- file( filename, "r")
    lines = readLines( f )
    close(f)
    temp = strsplit(lines,'[: ]')
    target = sapply(temp, function(row){as.numeric(row[1])})
    raw = lapply(temp, function(row){
        n = length(row);
        matrix(as.numeric(row[2:n]), ncol=2, byrow=T)
    })
    data = do.call('rbind', mapply(cbind, x=1:n, y=raw))
    list(target=target, data=sparseMatrix(x=data[,3], i=data[,1], j=data[,2]))
}

读取一个包含 14,000 行的测试文件大约需要一秒钟。

于 2015-04-02T19:39:43.520 回答
0

好的,第一个改进似乎是插入矩阵而不是 data.frame:

results <- matrix(nrow=N,ncol=K)

...

return(as.data.frame(results))

这明显更快,大约每秒 100 行。不过,总共有 25,000 行!

于 2012-12-07T07:57:26.723 回答