我是 Java 编程新手,现在处理大型文本文件时遇到了问题。我正在编写代码,把整个文件读成一个字符串,交给一个类解析,并在该类中将其转换为 XML。问题在于我只能处理少于 70K 行的数据;如果超过 800K 行,就会抛出错误“java.lang.OutOfMemoryError: Java heap space”。下面是我的示例文件和代码。
样本文件 1
H|20090908|
D|ABASTECEDORA NAVAL Y INDUSTRIAL, S.A. ,N|10 |9|4PANAMA |9|8 | | |1|20090908|AMLA |
D|ABDELNUR, NURY DE JESUS ,NULL |15 |9|0PANAMA |9|8 | | |1|20090908|AMLA |
D|ACECHILLY ,NULL |22 |9|0UNKNOWN |9|8 | | |1|20090908|AMLA |
D|ACEFROSTY ,NULL |24 |9|0UNKNOWN |9|8 | | |1|20090908|AMLA |
D|ACEFROSTY SHIPPING CO., LTD. ,NULL |25 |9|0MALTA |9|8 | | |1|20090908|AMLA |
T|0000013413|
这是解析器类。
import java.text.SimpleDateFormat;
public class WatchlistParser {
public Object receiveExternal(Object callback) {
Object result = null;
try {
result = this.process("external_watchlist", callback);
} catch (Exception e) {
System.out.println(e.getMessage());
}
return result;
}
public Object receiveInternal(Object callback) {
Object result = null;
try {
result = this.process("internal_watchlist", callback);
} catch (Exception e) {
System.out.println(e.getMessage());
}
return result;
}
public Object process(String filename, Object data) {
java.util.Scanner scanner = new java.util.Scanner(data.toString());
java.util.List<WatchlistEntryObject> list = new java.util.Vector<WatchlistEntryObject>();
int entryCount = 1;
String prefix="113";
if (filename.equalsIgnoreCase("internal_watchlist")) {
prefix = "113INT";
}
if (filename.equalsIgnoreCase("external_watchlist")) {
prefix = "113EXT";
}
//
// read all watchlist entry and store it into a list
SimpleDateFormat dateformatYYYYMMDD = new SimpleDateFormat("yyyyMMdd");
while (scanner.hasNext()) {
String line = scanner.nextLine();
// Get data lines
if (line.startsWith("D")) {
// System.out.println("-"+line);
// parse the data line
line = line.replace("&", "&");
line = line.replace("'", "''");
line = line.replace(">", ">");
line = line.replace("<", "<");
String fields[] = line.split("\\|");
// do validation
// field.size must 4
if (fields.length == 12) {
// do work
WatchlistEntryObject wo = new WatchlistEntryObject();
wo.setName(fields[1].trim());
wo.setId(fields[2].trim());
wo.setIdType(fields[3].trim());
wo.setAltID(fields[4].trim());
wo.setAltIDType(fields[5].trim());
wo.setReason(fields[6].trim());
try {
java.util.Date dob = dateformatYYYYMMDD.parse(fields[7].trim());
wo.setDob(dateformatYYYYMMDD.format(dob));
} catch (Exception e) {
wo.setDob("");
}
//wo.setDob(fields[7].trim());
wo.setRemark(fields[8].trim());
// Set critical will map Y/1 to 1 N/2 to 2
wo.setCriticalID(fields[9].trim());
wo.setFileName(filename);
wo.setLastMaintainDate(fields[10].trim());
wo.setLastMaintainUser(fields[11].trim());
wo.setWatchlistEntryID(wo.generateID(prefix, entryCount));
wo.setLocation(entryCount);
list.add(wo);
entryCount++;
}
} // end of if
} // end of while
StringBuffer sb = new StringBuffer();
sb.append("<Statement>DELETE FROM tbl_watch_list WHERE filename = '" + filename + "'</Statement>\n");
java.util.Iterator<WatchlistEntryObject> iterator = list.iterator();
while (iterator.hasNext()) {
WatchlistEntryObject entry = iterator.next();
sb.append(entry.getInsertSQL() + "\n");
//System.out.println(entry.getInsertSQL());
}
return encloseInXml(sb.toString());
}
//return sb.toString();
}
public String encloseInXml(String sql) {
StringBuffer sb = new StringBuffer();
sb.append("<?xml version ='1.0' encoding = 'UTF-8' standalone = 'no'?>\n");
sb.append("<VREMIT>\n");
sb.append(sql);
sb.append("</VREMIT>\n");
return sb.toString();
}
}
这是对象类。
/**
 * One watchlist row, holding every value as a string, plus the SQL INSERT statement
 * generated from those values.
 */
public class WatchlistEntryObject implements Cloneable {

    private String name;
    private String id;
    private String idType;
    private String altID;
    private String altIDType;
    private String reason;
    private String dob;
    private String remark;
    private String criticalID;
    private String lastMaintainDate;
    private String lastMaintainUser;
    private String watchlistEntryID;
    private String fileName;
    private String location;

    /**
     * Generates an id in the format {@code prefix-yyMMdd-nnnnnn}, where {@code yyMMdd} is
     * today's date and {@code nnnnnn} is the zero-padded count.
     *
     * @param prefix text placed before the date
     * @param count sequence number of the entry
     * @return the generated id
     */
    public static String generateID(String prefix, int count) {
        java.text.SimpleDateFormat dateFormat = new java.text.SimpleDateFormat("yyMMdd");
        String today = dateFormat.format(new java.util.Date());
        // The prefix is passed as an argument, not spliced into the pattern, so a '%' in it is harmless.
        return String.format("%s-%s-%06d", prefix, today, count);
    }

    /**
     * Returns the argument unchanged.
     *
     * @param s value to convert
     * @return {@code s}
     */
    public static String convertCID(String s) {
        return s;
    }

    /**
     * @return the watchlistEntryID
     */
    public String getWatchlistEntryID() {
        return watchlistEntryID;
    }

    /**
     * @param watchlistEntryID the watchlistEntryID to set
     */
    public void setWatchlistEntryID(String watchlistEntryID) {
        this.watchlistEntryID = watchlistEntryID;
    }

    /**
     * @return the name
     */
    public String getName() {
        return name;
    }

    /**
     * @param name the name to set
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the id
     */
    public String getId() {
        return id;
    }

    /**
     * @param id the id to set
     */
    public void setId(String id) {
        this.id = id;
    }

    /**
     * @return the idType
     */
    public String getIdType() {
        return idType;
    }

    /**
     * @param idType the idType to set; stored as "9" unless it parses as a byte value
     */
    public void setIdType(String idType) {
        this.idType = byteCodeOrDefault(idType, "9");
    }

    /**
     * @return the altID
     */
    public String getAltID() {
        return altID;
    }

    /**
     * @param altID the altID to set
     */
    public void setAltID(String altID) {
        this.altID = altID;
    }

    /**
     * @return the altIDType
     */
    public String getAltIDType() {
        return altIDType;
    }

    /**
     * @param altIDType the altIDType to set; stored as "9" unless it parses as a byte value
     */
    public void setAltIDType(String altIDType) {
        this.altIDType = byteCodeOrDefault(altIDType, "9");
    }

    /**
     * @return the reason
     */
    public String getReason() {
        return reason;
    }

    /**
     * @param reason the reason to set; stored as "7" unless it parses as a byte value
     */
    public void setReason(String reason) {
        this.reason = byteCodeOrDefault(reason, "7");
    }

    /**
     * @return the dob
     */
    public String getDob() {
        return dob;
    }

    /**
     * @param dob the dob to set
     */
    public void setDob(String dob) {
        this.dob = dob;
    }

    /**
     * @return the remark
     */
    public String getRemark() {
        return remark;
    }

    /**
     * @param remark the remark to set
     */
    public void setRemark(String remark) {
        this.remark = remark;
    }

    /**
     * @return the criticalID
     */
    public String getCriticalID() {
        return criticalID;
    }

    /**
     * Stores "2" when the argument is "N" (any case) or "2"; stores "1" for every other
     * value, including {@code null}.
     *
     * @param myID the raw critical identifier
     */
    public void setCriticalID(String myID) {
        // Constant-first comparison keeps a null argument in the "everything else" branch.
        if ("N".equalsIgnoreCase(myID) || "2".equals(myID)) {
            this.criticalID = "2";
        } else {
            this.criticalID = "1";
        }
    }

    /**
     * @return the lastMaintainDate
     */
    public String getLastMaintainDate() {
        return lastMaintainDate;
    }

    /**
     * @param lastMaintainDate the lastMaintainDate to set
     */
    public void setLastMaintainDate(String lastMaintainDate) {
        this.lastMaintainDate = lastMaintainDate;
    }

    /**
     * @return the lastMaintainUser
     */
    public String getLastMaintainUser() {
        return lastMaintainUser;
    }

    /**
     * @param lastMaintainUser the lastMaintainUser to set
     */
    public void setLastMaintainUser(String lastMaintainUser) {
        this.lastMaintainUser = lastMaintainUser;
    }

    /**
     * @return the fileName
     */
    public String getFileName() {
        return fileName;
    }

    /**
     * @param fileName the fileName to set
     */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /**
     * @return the location
     */
    public String getLocation() {
        return location;
    }

    /**
     * @param loc the location, stored as its decimal string
     */
    public void setLocation(int loc) {
        this.location = Integer.toString(loc);
    }

    /**
     * Builds the {@code <Statement>} element holding the INSERT for this entry.
     *
     * <p>NOTE(review): values are concatenated into the SQL text as-is, so the caller must
     * have escaped quotes and XML markup before calling the setters.
     *
     * @return the statement element
     */
    public String getInsertSQL() {
        return " <Statement> INSERT INTO tbl_watch_list( watch_entry_id,entry_data,id,id_type,alt_id,alt_id_type,"
                + "reason,date_of_birth,remark,critical_identifier,filename,dt_last_chg,username,location)"
                + " VALUES ('" + watchlistEntryID + "','" + name + "','" + id + "','" + idType
                + "','" + altID + "','" + altIDType + "','" + reason + "','" + dob
                + "','" + remark + "','" + criticalID + "','" + fileName
                + "', getDate() ,'Xgate','" + location + "'); </Statement>";
    }

    /** Returns {@code code} if it parses as a byte value, otherwise {@code fallback}. */
    private static String byteCodeOrDefault(String code, String fallback) {
        try {
            Byte.parseByte(code);
            return code;
        } catch (NumberFormatException e) {
            return fallback;
        }
    }
}
最后是测试类。
public class TestParser {
/**
* @param args the command line arguments
*/
public static void main(String[] args) throws IOException {
// read the file into a string
String data = "";
try {//reading the files and convert it to UTF-8
data = new String(readFile("H:\\external_watchlist.txt"), "UTF-8");
} catch (Exception e) {
Trace.error("Encoding Exception", e);//catch all exceptions
}
WatchlistParser parser = new WatchlistParser();
String sql = (String) parser.receiveExternal(data);
System.out.println(sql);
}
public static byte[] readFile(String path) {
try {
//java.io.BufferedReader br = new java.io.BufferedReader(new java.io.FileReader(path));
java.io.FileInputStream fis = new java.io.FileInputStream(path);
java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream();
byte[] buffer = new byte[1024];
while (fis.available() > 0) {
int len = fis.read(buffer);
bos.write(buffer, 0, len);
}
bos.flush();
bos.close();
fis.close();
return bos.toByteArray();
} catch (Exception e) {
Trace.error("Read File Exception", e);
}
return null;
}
}
我正在考虑在字符串中扫描分隔符“D|”,计算每次可以解析的最大行数(例如每 10K 行一批),并将每批输出写入单独的文件,以避免“java.lang.OutOfMemoryError: Java heap space”错误。
还有其他更好的方法吗?我会很感激任何建议。
谢谢。