1

我想下载存储在MySQL中的表格形式的HTML内容,然后通过PHP代码重新显示。为此,我使用 jdbc 创建了一个网络爬虫。我成功下载了所有内容,除了印地语文本被一堆 ???????? (这意味着不支持)。我的 HTML 页面完全支持 UTF-8。我的 MySQL 表和数据库也支持 utf-8,我明确地测试了它们。

在整个 Java 代码中,我尝试实现 UTF-8 编码,但仍然无处可去。

我的一些 utf-8 代码如下:

private String url = "jdbc:mysql://localhost:3306?useUnicode=yes&characterEncoding=UTF-8";

[在 url 连接字符串中]

BufferedReader bReader = new BufferedReader(new InputStreamReader(in,"UTF-8"));

[下载(阅读)html内容时]

这是大部分代码

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.*;

public class TestDataParsing2 {

private Connection conn = null;
private String url = "jdbc:mysql://localhost:3306?useUnicode=yes&characterEncoding=UTF-8";
private String dbName = "/hindi-eng"; 
private String driver = "com.mysql.jdbc.Driver";
private String userName = "root"; 
private String password = "";
private String TABLE = "dict";

private void initdb(){
     try {
          Class.forName(driver).newInstance();
          conn = DriverManager.getConnection(url+dbName,userName,password);

     } catch (Exception e) {
          e.printStackTrace();
     }
}   
private void closedb(){ 
     try {
          conn.close();
        } catch (Exception e) {
          e.printStackTrace();
        }
} 
public void process(String content){ 
    try{  ufferedReader reader =  new BufferedReader(new StringReader(content));
        String text = "";           

        boolean start1 = false;
        boolean start2 = false;
        while (( text = reader.readLine()) != null)
        {
            if(text.contains("\"a"))
            {


                //System.out.println(text);
                System.out.println("COUNTER : " + counter); 
                String id = getID(text);
                System.out.println("word : " + id);
                this.ID = id;
                String title = getTitle(text);
                System.out.println("type: " + title);
                this.title = title;

                String owner = getOwner(text);
                System.out.println("hindi meaning : " + owner);
                this.owner = owner;

                start1 = true;
            }               

            if(start1 && start2)
            {
                String s = getS(text);
                System.out.println("Usage is : " + s);
                this.s = s;
                counter++;
                insert();
                start2=false;
                start1= false;                  
            }

            if(start1)
            {start2= true;
            }               
        }
    }catch(Exception e){
        System.out.println(e);
    }   
}

public void insert(){

        String insertString = "INSERT INTO " + TABLE + " VALUES (" + this.counter + ",'" + this.ID 
                    + "','" + this.title + "','" + this.owner + "','" + this.s + "')";

                    //System.out.print(insertString);           
        try {
            Statement stmt = conn.createStatement();
            stmt.executeUpdate(insertString);
            stmt.close();
        } catch(Exception e) {
            System.out.println(e);
        }
    }   

public String getID(String text){
    String id = "";
    id = text.substring(text.indexOf("\"")+1, text.indexOf("\",")); 
    return id;
}

public String getTitle(String text){
    String title = "";
    title = text.substring(text.indexOf(",\"")+2, text.indexOf("\",\"1.")); 
    return title;
}

public String getOwner(String text){

try{
    String owner = "";
    owner = text.substring(text.indexOf("\",\"1.")+5, text.indexOf("\"<br>"));      
    int i;      
    for(i=0;i<owner.length();i++)
    {
    String fifthChar = "\u00AE";
    int codePoint = owner.codePointAt(i);
    //System.out.println(codePoint);
    //System.out.println("'" + owner.substring(i, i+1) + "': Ux00" + Integer.toString(codePoint, 16));
    //System.out.println(fifthChar);    
    }

    //System.out.println(Character.codePointAt(owner));
    //System.out.println("enetred");
    //System.out.println(owner);
    return owner;
}
catch(Exception e)
{ System.out.println(e);
System.out.println("eeee");
}

return owner;
}

public String getS(String text){
    String s = "";
    s = text.substring(0, text.indexOf("<br>"));

    return s;
}   

public String download(String path) {
    String result = "";
    try {
        URL url = new URL(path);
        URLConnection conn = url.openConnection();
        conn.setDoOutput(true);
        InputStream in = null;
        in = url.openStream();
        String content = pipe(in,"utf-8");
        //System.out.println(content);  
        result = content;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result; 
}

public String pipe(InputStream in,String charset) throws IOException {
    StringBuffer s = new StringBuffer();

    if(charset==null||"".equals(charset)){
        charset="utf-8";
    }

    String rLine = null;
    BufferedReader bReader = new BufferedReader(new InputStreamReader(in,"UTF-8"));

    while ( (rLine = bReader.readLine()) != null)
    {
        String tmp_rLine = rLine;
            s.append(tmp_rLine+"\n");
            }
        tmp_rLine = null;

    in.close(); 
    return s.toString();
}

public static void main(String[] args) {
    TestDataParsing2 tdp = new TestDataParsing2();
    tdp.initdb();
    System.out.println("process started");

        String urlPath = "file:///C:/Users/Abhinav/Downloads/Compressed/eng-hindi-dict-
                    utf8/sa.htm";
        String content = tdp.download(urlPath);
        tdp.process(content);
    tdp.closedb();
}

}

其他所有细节都正常工作。非常感谢任何有用的帮助。

4

0 回答 0