2

我在数据库的表中有一组原始文本,我需要使用一组单词替换该集合中的一些单词。我将所有要替换的术语及其替代项放在一个文本文件中,如下所示

min=admin
lelet=lambat
lemot=lambat
nii=nih
ntu=itu

等等。我已成功启动文件和扫描仪的变量来阅读该术语及其替代品的集合。

我循环所有数据集并将原始文本保存在同一循环中的字符串中我循环所有术语集合并将其行保存为字符串名称“模式”,并将模式拆分为两个名为“术语”和“替换器”的字符串在这个循环中,我启动一个新字符串,它的值是由replaceAll(term,replacer)修改的数据集中的字符串,用于术语集合的结束循环,然后我将新字符串插入到数据库结束循环中的另一个表中以获取数据集

我按照以下方式手动执行 replaceAll("min","admin") 及其工作,但它确实可以手动编码近 2000 个术语来替换它。

任何人都曾经面对过这种真正的事情..我现在真的很需要帮助:(

 package sentimenrepo;
    import javax.swing.*;
    import java.sql.*;
    import java.io.*;
    //import java.util.HashMap;
    import java.util.Scanner;
    //import java.util.Map;
    /**
     *
     * @author herman
     */
    public class synonimReplaceV2 extends SwingWorker {
        protected Object doInBackground() throws Exception {
                 new skripsisentimen.sentimenttwitter().setVisible(true);

                  Integer row = 0;
                  File synonimV2 = new File("synV2/catatan_kata_sinonim.txt");
                  String newTweet = "";
                DB db = new DB();
            Connection conn = db.dbConnect("jdbc:mysql://localhost:3306/tweet", "root", "");
            try{
              Statement select = conn.createStatement();
              select.executeQuery("select * from synonimtweet");
              ResultSet RS = select.getResultSet();
              Scanner scSynV2 = new Scanner(synonimV2);
              while(RS.next()){
                   row++;

                       String no = RS.getString("no");
                  String tweet = " "+ RS.getString("tweet");
                  String published = RS.getString("published");
                  String label = RS.getString("label");
                  clean2 cleanv2 = new clean2();

                  newTweet = cleanv2.cleanTweet(tweet);
                    try{
                         Statement insert = conn.createStatement();
                         insert.executeUpdate("INSERT INTO synonimtweet_v2(no,tweet,published,label) values('"
                                 +no+"','"+newTweet+"','"+published+"','"+label+"')");
                        String current = skripsisentimen.sentimenttwitter.txtAreaResult.getText();
                  skripsisentimen.sentimenttwitter.txtAreaResult.setText(current+"\n"+row+"original : "+tweet+"\n"+newTweet+"\n______________________\n");
                  skripsisentimen.sentimenttwitter.lblStat.setText(row+" tweet read");
                  skripsisentimen.sentimenttwitter.txtAreaResult.setCaretPosition(skripsisentimen.sentimenttwitter.txtAreaResult.getText().length() - 1);

                    }catch(Exception e){
                         skripsisentimen.sentimenttwitter.lblStat.setText(e.getMessage());

                    }

                 skripsisentimen.sentimenttwitter.lblStat.setText(e.getMessage());

              }
            }catch(Exception e){
                skripsisentimen.sentimenttwitter.lblStat.setText(e.getMessage());

            }
           return row;
        }
        class clean2{

            public clean2(){}

            public String cleanTweet(String tweet){
               File synonimV2 = new File("synV2/catatan_kata_sinonim.txt");

                String pattern = "";
              String term = "";
              String replacer = "";
              String newTweet="";
             try{
                Scanner scSynV2 = new Scanner(synonimV2);
                 while(scSynV2.hasNext()){
                  pattern = scSynV2.next();
                  term = pattern.split("=")[0];
                  replacer = pattern.split("=")[1];
                  newTweet = tweet.replace(term, replacer);
                 }
             }catch(Exception e){
                 e.printStackTrace();
             }

                System.out.println(newTweet+"\n"+tweet);
              return newTweet;

            }
        }

    }

更新


我刚刚意识到代码实际上有效,但仅适用于数据库中的第一行,第二行等等都静止不动。这是我更新我构建的最新代码

public class synonimReplaceV2 extends SwingWorker {

    protected Object doInBackground() throws Exception {
             new skripsisentimen.sentimenttwitter().setVisible(true);

              Integer row = 0;

              String newTweet = "";
            DB db = new DB();
        Connection conn = db.dbConnect("jdbc:mysql://localhost:3306/tweet", "root", "");
        try{
          Statement select = conn.createStatement();
          select.executeQuery("select * from synonimtweet limit 2,10");
          ResultSet RS = select.getResultSet();
          FileReader readSyn = new FileReader("synV2/catatan_kata_sinonim.txt");
          BufferedReader buffSyn = new BufferedReader(readSyn);
          while(RS.next()){
               row++;
                   String no = RS.getString("no");
              String tweet = " "+ RS.getString("tweet");
              String published = RS.getString("published");
              String label = RS.getString("label");
              String pattern = "";
             while((pattern=buffSyn.readLine())!=null){
                 String patternTerm = pattern.split("=")[0];
                 String patternSubs = pattern.split("=")[1];
                 tweet = tweet.replaceAll("\\s"+patternTerm, patternSubs);
             }

                try{
                     Statement insert = conn.createStatement();
                     insert.executeUpdate("INSERT INTO synonimtweet_v2(no,tweet,published,label) values('"
                             +no+"','"+tweet+"','"+published+"','"+label+"')");
                    String current = skripsisentimen.sentimenttwitter.txtAreaResult.getText();
              skripsisentimen.sentimenttwitter.txtAreaResult.setText(current+"\n"+row+"original : "+tweet+"\n"+newTweet+"\n______________________\n");
              skripsisentimen.sentimenttwitter.lblStat.setText(row+" tweet read");
              skripsisentimen.sentimenttwitter.txtAreaResult.setCaretPosition(skripsisentimen.sentimenttwitter.txtAreaResult.getText().length() - 1);

                }catch(Exception e){
                     skripsisentimen.sentimenttwitter.lblStat.setText(e.getMessage());
                }


          }
        }catch(Exception e){
            skripsisentimen.sentimenttwitter.lblStat.setText(e.getMessage());
           // System.out.println(e.getMessage());
        }
        Thread.sleep(100);
       return row;
    }
}
4

4 回答 4

2

打开同义词文件并为您的每一行迭代超过 2,000 行ResultSet有点浪费。

一次将同义词加载到内存映射中,以唯一拼写错误的术语为键,然后在映射上查找结果集中的每一行,并根据需要进行替换。

于 2012-06-11T16:36:48.727 回答
1

让我们使用这两种解决方案为您构建一个解决方案:

首先,您使用所有键创建一个 HashMap:

public static HashMap<String, String> getMap() {
        //your version would read from the file
        HashMap<String,String> myMap=new HashMap<String,String>();
        myMap.put("min", "admin");
        myMap.put("lelet", "lambat");
        myMap.put("lemot", "lambat");
        myMap.put("nii", "nih");
        myMap.put("ntu", "itu");
        return(myMap);
    }

其次,您创建一个包含哈希图中所有键的模式:

public static String getPattern(HashMap<String,String> mapReplacement) {
        String pattern="";
        for (String s : mapReplacement.keySet()) {
            if (!pattern.isEmpty()) {
                pattern=pattern+"|";
            }
            pattern=pattern+s;
        }        
        return(pattern);
    }

接下来,您可以创建一个使用您创建的两个结构的 cleanTweet 方法:

public static String cleanTweet(String tweet, Pattern pattern,HashMap<String, String> myMap) {
        String newTweet=tweet;
        Matcher matcher = pattern.matcher(newTweet);
        int start=0;
        while (matcher.find()) {
            String key=matcher.group();
            String replacement=myMap.get(key);
            if (replacement!=null) {
                newTweet=newTweet.replace(key, replacement );
            }            
        }
        return(newTweet);
    }

这可能需要一些调整才能完美(我只测试了几个案例),但关键是您将在您的密钥中迭代一次,然后仅在您的推文上迭代。

我希望它有所帮助。

于 2012-06-11T18:03:21.010 回答
0

我没有尝试,但在我看来你几乎已经得到它 - 只需替换这一行:

newTweet = tweet.replace(term, replacer);

有了这个:

tweet = tweet.replaceAll(term, replacer);

由于您不再使用newTweet,请返回tweet

return tweet;

您还应该删除该newTweet声明。

此外,您不应该阅读Scanner阅读行。改为使用FileReader

于 2012-06-11T16:31:19.197 回答
0

谢谢大家,我找到了为什么代码不起作用的答案,

每次程序从数据库中读取一行时,都应该启动包含术语及其替代项的 txt 文件。

代码是这样的

public class synonimReplaceV2 extends SwingWorker {

    protected Object doInBackground() throws Exception {
             new skripsisentimen.sentimenttwitter().setVisible(true);

              Integer row = 0;

              String newTweet = "";
            DB db = new DB();
        Connection conn = db.dbConnect("jdbc:mysql://localhost:3306/tweet", "root", "");
        try{
          Statement select = conn.createStatement();
          select.executeQuery("select * from synonimtweet limit 2,10");
          ResultSet RS = select.getResultSet();

          while(RS.next()){
               row++;


          FileReader readSyn = new FileReader("synV2/catatan_kata_sinonim.txt");
          BufferedReader buffSyn = new BufferedReader(readSyn);

                   String no = RS.getString("no");
              String tweet = " "+ RS.getString("tweet");
              String published = RS.getString("published");
              String label = RS.getString("label");
              String pattern = "";
             while((pattern=buffSyn.readLine())!=null){
                 String patternTerm = pattern.split("=")[0];
                 String patternSubs = pattern.split("=")[1];
                 tweet = tweet.replaceAll("\\s"+patternTerm, patternSubs);
             }

                try{
                     Statement insert = conn.createStatement();
                     insert.executeUpdate("INSERT INTO synonimtweet_v2(no,tweet,published,label) values('"
                             +no+"','"+tweet+"','"+published+"','"+label+"')");
                    String current = skripsisentimen.sentimenttwitter.txtAreaResult.getText();
              skripsisentimen.sentimenttwitter.txtAreaResult.setText(current+"\n"+row+"original : "+tweet+"\n"+newTweet+"\n______________________\n");
              skripsisentimen.sentimenttwitter.lblStat.setText(row+" tweet read");
              skripsisentimen.sentimenttwitter.txtAreaResult.setCaretPosition(skripsisentimen.sentimenttwitter.txtAreaResult.getText().length() - 1);

                }catch(Exception e){
                     skripsisentimen.sentimenttwitter.lblStat.setText(e.getMessage());
                }


          }
        }catch(Exception e){
            skripsisentimen.sentimenttwitter.lblStat.setText(e.getMessage());
           // System.out.println(e.getMessage());
        }
        Thread.sleep(100);
       return row;
    }
}

但我实际上想应用上面 rlinden 制作的代码,但无法弄清楚如何调用 cleanTweet 函数。

于 2012-06-13T11:15:53.583 回答