我为我的 Pig 脚本编写了一个过滤函数(FilterFunc),但作业运行耗时过长。在 5 节点集群上,待处理的数据量为 15 GB。
谁能建议一下如何优化下面的代码:
package org.apache.pig.builtin;
import java.util.*;
import java.io.IOException;
import java.util.Map;
import org.apache.pig.FilterFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.util.WrappedIOException;
public class filterIP extends FilterFunc {
ArrayList<String> Ar1=new ArrayList<String>(){
{
add("151.193.220.28");
....
//Around 2000 IP's to be filtered
add("129.22.63.207");
}
};
public Boolean exec(Tuple input) throws IOException {
if (input == null || input.size() == 0)
return true;
try {
Object values = input.get(0);
if (values instanceof DataBag)
return ((DataBag)values).size() == 0;
else if (values instanceof Map)
return ((Map)values).size() == 0;
else if (values instanceof String){
for(String s:Ar1){
if(((String)values).matches(".*"+s+".*"))
return false;
}
return true;
//return !((String)values).matches(".*"+Ar1.get(1)+".*");
}
else{
return false;
// throw new IOException("Cannot test a " + DataType.findTypeName(values) + " for required match.");
}
} catch (ExecException ee) {
throw WrappedIOException.wrap("Caught exception processing input row ", ee);
}
}
}*