After a bit of effort I finally found a good solution; maybe it will help others too. When I tried to load a 2,367KB csv file containing 18,226 rows, the minimum times taken by different PHP scripts were: (1) the CsvImporter example from the fgetcsv documentation on php.net, and (2) file_get_contents, which ended in PHP Fatal error: Allowed memory exhausted.
(1) took 0.92574405670166 s; (2) took 0.12543702125549 s (read as a string) and 0.52903485298157 s (split into an array). Note: these timings do not include inserting into MySQL.
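For reference, here is a minimal sketch of how such timings can be measured; the parsing call is a placeholder for whichever approach is being benchmarked:

$start = microtime(true);
// placeholder: substitute the parsing approach under test
$rows = array_map('str_getcsv', file($file));
echo 'Took '.(microtime(true) - $start).' seconds';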
The best solution I found takes 3.0644409656525 s in total, including inserting into the database and some conditional checks. It took 11 seconds to process an 8MB file. The solution is:
$csvInfo = analyse_file($file, 5);
$lineSeperator = $csvInfo['line_ending']['value'];
$fieldSeperator = $csvInfo['delimiter']['value'];
$columns = getColumns($file);
echo '<br>========Details========<br>';
echo "Line Sep: \t ".$lineSeperator;
echo "<br>Field Sep:\t ".$fieldSeperator;
echo '<br>Columns: ';print_r($columns);
echo '<br>========Details========<br>';
$ext = pathinfo($file, PATHINFO_EXTENSION);
$table = str_replace(' ', '_', basename($file, "." . $ext));
$rslt = table_insert($table, $columns);
if($rslt){
    // skip the header row, since it was already used to create the columns
    $query = "LOAD DATA LOCAL INFILE '".$file."' INTO TABLE $table FIELDS TERMINATED BY '$fieldSeperator' LINES TERMINATED BY '$lineSeperator' IGNORE 1 LINES";
    var_dump(addToDb($query, false));
}
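// Note: LOAD DATA LOCAL INFILE only works when local_infile is enabled on both
// the MySQL server and the client connection; otherwise the query above fails
// with "The used command is not allowed with this MySQL version".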
function addToDb($query, $getRec = true){
    //echo '<br>Query : '.$query;
    $con = @mysql_connect('localhost', 'root', '');
    @mysql_select_db('rtest', $con);
    $result = mysql_query($query, $con);
    if($result){
        if($getRec){
            $data = array();
            while ($row = mysql_fetch_assoc($result)) {
                $data[] = $row;
            }
            return $data;
        }
        return true;
    }else{
        var_dump(mysql_error());
        return false;
    }
}
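// The mysql_* functions used above were deprecated in PHP 5.5 and removed in
// PHP 7. A minimal sketch of the same helper on mysqli (assuming the same
// localhost/root/rtest credentials) could look like this:
function addToDbMysqli($query, $getRec = true){
    // connect and select the database in one call
    $con = mysqli_connect('localhost', 'root', '', 'rtest');
    if(!$con){
        var_dump(mysqli_connect_error());
        return false;
    }
    $result = mysqli_query($con, $query);
    if(!$result){
        var_dump(mysqli_error($con));
        return false;
    }
    if($getRec){
        // fetch all rows for SELECT-style queries
        $data = array();
        while ($row = mysqli_fetch_assoc($result)) {
            $data[] = $row;
        }
        return $data;
    }
    return true;
}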
function table_insert($table_name, $table_columns) {
    $queryString = "CREATE TABLE " . $table_name . " (";
    $values = '';
    foreach ($table_columns as $column) {
        $values .= (strtolower(str_replace(' ', '_', $column))) . " VARCHAR(2048), ";
    }
    // drop the trailing ", "
    $values = substr($values, 0, -2);
    $queryString .= $values . ") ";
    // echo $queryString;
    return addToDb($queryString, false);
}
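// For example, a header row of "First Name,Last Name,Email" would generate:
//   CREATE TABLE my_table (first_name VARCHAR(2048), last_name VARCHAR(2048), email VARCHAR(2048))
// (the table name shown here is illustrative; it actually comes from the file name)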
function getColumns($file){
    $cols = array();
    if (($handle = fopen($file, 'r')) !== FALSE){
        // read rows until the first non-empty one, which holds the column names
        while (($row = fgetcsv($handle)) !== FALSE){
            $cols = $row;
            if(count($cols) > 0){
                break;
            }
        }
        fclose($handle);
        return $cols;
    }
    return false;
}
function analyse_file($file, $capture_limit_in_kb = 10) {
    $output = array();
    // capture starting memory usage
    $output['peak_mem']['start'] = memory_get_peak_usage(true);
    // log how much of the file was sampled (in KB)
    $output['read_kb'] = $capture_limit_in_kb;
    // read in a sample of the file
    $fh = fopen($file, 'r');
    $contents = fread($fh, ($capture_limit_in_kb * 1024));
    fclose($fh);
    // specify allowed field delimiters
    $delimiters = array(
        'comma'     => ',',
        'semicolon' => ';',
        'tab'       => "\t",
        'pipe'      => '|',
        'colon'     => ':'
    );
    // specify allowed line endings
    $line_endings = array(
        'rn' => "\r\n",
        'n'  => "\n",
        'r'  => "\r",
        'nr' => "\n\r"
    );
    // count the occurrences of each line ending
    foreach ($line_endings as $key => $value) {
        $line_result[$key] = substr_count($contents, $value);
    }
    // sort so the most frequent line ending comes last
    asort($line_result);
    // log to output array
    $output['line_ending']['results'] = $line_result;
    $output['line_ending']['count'] = end($line_result);
    $output['line_ending']['key'] = key($line_result);
    $output['line_ending']['value'] = $line_endings[$output['line_ending']['key']];
    $lines = explode($output['line_ending']['value'], $contents);
    // remove the last line of the array, as it may be incomplete
    array_pop($lines);
    // create a string from the complete lines
    $complete_lines = implode(' ', $lines);
    // log statistics to output array
    $output['lines']['count'] = count($lines);
    $output['lines']['length'] = strlen($complete_lines);
    // count the occurrences of each delimiter
    foreach ($delimiters as $delimiter_key => $delimiter) {
        $delimiter_result[$delimiter_key] = substr_count($complete_lines, $delimiter);
    }
    // sort so the most frequent delimiter comes last
    asort($delimiter_result);
    // log statistics to output array with the largest count as the value
    $output['delimiter']['results'] = $delimiter_result;
    $output['delimiter']['count'] = end($delimiter_result);
    $output['delimiter']['key'] = key($delimiter_result);
    $output['delimiter']['value'] = $delimiters[$output['delimiter']['key']];
    // capture ending memory usage
    $output['peak_mem']['end'] = memory_get_peak_usage(true);
    return $output;
}
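For reference, the array returned by analyse_file has roughly the following shape (the values below are placeholders, not output from a real run):

Array
(
    [peak_mem]    => Array ( [start] => ..., [end] => ... )
    [read_kb]     => 5
    [line_ending] => Array ( [results] => ..., [count] => ..., [key] => n, [value] => "\n" )
    [lines]       => Array ( [count] => ..., [length] => ... )
    [delimiter]   => Array ( [results] => ..., [count] => ..., [key] => comma, [value] => "," )
)

This is why the driver script at the top reads $csvInfo['line_ending']['value'] and $csvInfo['delimiter']['value'].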