0

我有一个程序连接到数据库并检查重复条目并删除重复项并更新原始记录中相同文本的数量。

我的问题是,只有 100000 条记录需要几个小时,并且程序以“需要堆空间”错误结束。

如何优化该程序以快速完成任务?

public class Main {
  // serveropsdb.properties file location = /opt/serverops/scripts/serverops.properties
  private final static String PROPERTIES_FILENAME=File.separatorChar+"opt"+File.separatorChar+"serverops"+File.separatorChar+"scripts"+File.separatorChar+"serveropsdb.properties";
  private final static String  DATE_FORMAT = "yyyy/MM/dd HH:mm:ss";
  private final static String  PROPERTY_NAME_STARTTIME= "start.time";
  private final static String  PROPERTY_NAME_ENDTIME= "end.time";
  private final static String  PROPERTY_NAME_DB_TABLENAME= "db.tablename";
  private final static String  PROPERTY_NAME_DB_USERNAME= "db.username";
  private final static String  PROPERTY_NAME_DB_PASSWORD= "db.password";
  private final static String  PROPERTY_NAME_DB_NAME= "db.name";

  public static void main(String[] args) {
    System.out.println("DB consolidation started");

    Properties properties = new Properties();
    try { //loading serverops.properties file
      properties.load(new FileInputStream(PROPERTIES_FILENAME));
    } catch (IOException e) {
      System.out.println("Error loading "+PROPERTIES_FILENAME+" properties!");
      e.printStackTrace();
    }

    try { // loading jdbc driver
      Class.forName("com.mysql.jdbc.Driver");
    } catch (ClassNotFoundException e) {
      System.out.println(" JDBC Driver not found");
      e.printStackTrace();
      return;
    }

    System.out.println("MySQL JDBC Driver found! Connecting to Database");
    Connection connection = null;

    ResultSet resultSet = null;
    ResultSet findResultSet = null;

    PreparedStatement allStatement, findStatement, updateStatement;

    //date formate used yyyy/MM/dd HH:mm:ss
    DateTimeFormatter dateFromatter= DateTimeFormat.forPattern(DATE_FORMAT);

    DateTime startDate = dateFromatter.parseDateTime(properties.getProperty(PROPERTY_NAME_STARTTIME));
    DateTime endDate= dateFromatter.parseDateTime( properties.getProperty( PROPERTY_NAME_ENDTIME));

    Timestamp t1 = new Timestamp(startDate.getMillis());
    Timestamp t2 = new Timestamp(endDate.getMillis());

    StringBuilder sql;      
    String toBeRemovedIndex ;
    int updateNumEntries; 
    String toBeRemovedLog;
    String toBeRemovedHostname; 
    boolean  updateDB=false;

    try { // connecting to DB with credentials from the properties file 
      connection = DriverManager.getConnection( 
        "jdbc:mysql://localhost:3306/"+properties.getProperty(PROPERTY_NAME_DB_NAME), 
        properties.getProperty(PROPERTY_NAME_DB_USERNAME),
        properties.getProperty(PROPERTY_NAME_DB_PASSWORD));

  // getting all record between start time and end time 
      sql = new StringBuilder( "SELECT * FROM" 
                             + properties.getProperty(PROPERTY_NAME_DB_TABLENAME)
                             + " WHERE last_seen BETWEEN ? AND ?");

      allStatement = connection.prepareStatement(sql.toString());
      allStatement.setTimestamp(1, t1);
      allStatement.setTimestamp(2,t2); 
      resultSet = allStatement.executeQuery( );

      while (resultSet.next()) {
        toBeRemovedIndex  = resultSet.getString(1);
        updateNumEntries = resultSet.getInt(2);
        toBeRemovedLog = resultSet.getString(3);
        toBeRemovedHostname = resultSet.getString(5);

        // selecting the duplicate entries with logmessage , id and hostname
        sql = new StringBuilder( "SELECT * FROM "
                               + properties.getProperty(PROPERTY_NAME_DB_TABLENAME) 
                               + " where log_message=? and  id <> ? and hostname = ?"  );

        findStatement = connection.prepareStatement(sql.toString());
        findStatement.setString(1, toBeRemovedLog );
        findStatement.setString(2, toBeRemovedIndex );
        findStatement.setString(3, toBeRemovedHostname);

        findResultSet  = findStatement.executeQuery();
        String newId="";
        while( findResultSet.next()) {
          newId = findResultSet.getString(1);
          updateNumEntries +=findResultSet.getInt(2);
          updateDB = true;
        }

        if(updateDB ) { //  if duplicate entry found - deleting it from the db 
          sql = new StringBuilder( "DELETE  FROM"
                                 + properties.getProperty(PROPERTY_NAME_DB_TABLENAME)
                                 + " where id = ?" );
          updateStatement = connection.prepareStatement(sql.toString());
          updateStatement.setString(1, toBeRemovedIndex  );
          updateStatement.executeUpdate();

          // updating similar entry with number of records
          sql = new StringBuilder( "Update "
                                 + properties.getProperty(PROPERTY_NAME_DB_TABLENAME)
                                 + " set number_of_entries = ? where  id = ? " );

          updateStatement = connection.prepareStatement(sql.toString());
          updateStatement.setLong(1, updateNumEntries );
          updateStatement.setString(2, newId );
          updateStatement.executeUpdate();
          updateDB = false;
          updateStatement=null;
        }
      }
    } catch (SQLException e) {
      System.out.println("Connection Failed! Check output console");
      e.printStackTrace();
      return;
    }

    DateTime now = new DateTime();

    if(connection != null) {
      try {
        connection.close();
      } catch (SQLException e) {
        e.printStackTrace();
      }
    }

    System.out.println( "Current time = "
                      + now.toString() 
                      + " DB consoladiation done successfully between "
                      + t1.toString() + " and "+ t2.toString());
    }
 }

表结构:

   CREATE TABLE IF NOT EXISTS `syslog_parsed` (
     `id` char(36) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL,
     `number_of_entries` int(11) NOT NULL,
     `log_message` text CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL,
     `last_seen` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
     `hostname` varchar(200) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL,
     KEY `ID` (`id`)
   ) ENGINE=InnoDB DEFAULT CHARSET=latin1;
4

1 回答 1

2

您的基本问题是您嵌套了 while 循环。您的内部循环对包含 100k 个项目的表执行 sql 查询。这意味着您有大约 100000^2 次操作,包括 100k 次 sql 调用。此外,您的所有调用都没有关闭,因此 Java 将结果集保存在内存中 - 因此会出现 heapsize 错误。

您的大部分代码都是多余的,可以用 sql 语句省略。这可能不太正确,因此您应该针对您的数据库对其进行测试,但它为您提供了关键元素:

String sql = "SELECT id FROM ( "
           + "  SELECT id, log_message, hostname FROM syslog_parsed "
           + "  WHERE last_seen BETWEEN ? AND ? "
           + "  GROUP BY id, log_message, hostname "
           + ") x "//creates a subtable with grouped items
           + "GROUP BY id "
           + "HAVING count(*) > 1"//only returns rows that appear more than once

PreparedStatement stmt = connection.prepareStatement(sql.toString());
stmt.setTimestamp(1, t1);
stmt.setTimestamp(2, t2);
resultSet =  stmt.executeQuery( );

while (resultSet.next()) {
  //Copy result
  //Delete all of that result
  //Add back in one such result
}

这应该运行得更快,因为代码只需要处理重复项(大概不超过 50k 条记录,但可能要少得多)。它将查找重复项的工作卸载到 sql 中,这非常适合此类任务。

未来的注意事项:请格式化您的代码,否则它很难阅读。您可能还想尝试关注真正重要的代码。

于 2012-05-10T16:41:00.963 回答