我想下载存储在MySQL中的表格形式的HTML内容,然后通过PHP代码重新显示。为此,我使用 jdbc 创建了一个网络爬虫。我成功下载了所有内容,除了印地语文本被一堆 ???????? (这意味着不支持)。我的 HTML 页面完全支持 UTF-8。我的 MySQL 表和数据库也支持 utf-8,我明确地测试了它们。
在整个 Java 代码中,我尝试实现 UTF-8 编码,但仍然无处可去。
我的一些 utf-8 代码如下:
private String url = "jdbc:mysql://localhost:3306?useUnicode=yes&characterEncoding=UTF-8";
[在 url 连接字符串中]
BufferedReader bReader = new BufferedReader(new InputStreamReader(in,"UTF-8"));
[下载(阅读)html内容时]
这是大部分代码
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.*;
public class TestDataParsing2 {
private Connection conn = null;
private String url = "jdbc:mysql://localhost:3306?useUnicode=yes&characterEncoding=UTF-8";
private String dbName = "/hindi-eng";
private String driver = "com.mysql.jdbc.Driver";
private String userName = "root";
private String password = "";
private String TABLE = "dict";
private void initdb(){
try {
Class.forName(driver).newInstance();
conn = DriverManager.getConnection(url+dbName,userName,password);
} catch (Exception e) {
e.printStackTrace();
}
}
private void closedb(){
try {
conn.close();
} catch (Exception e) {
e.printStackTrace();
}
}
public void process(String content){
try{ ufferedReader reader = new BufferedReader(new StringReader(content));
String text = "";
boolean start1 = false;
boolean start2 = false;
while (( text = reader.readLine()) != null)
{
if(text.contains("\"a"))
{
//System.out.println(text);
System.out.println("COUNTER : " + counter);
String id = getID(text);
System.out.println("word : " + id);
this.ID = id;
String title = getTitle(text);
System.out.println("type: " + title);
this.title = title;
String owner = getOwner(text);
System.out.println("hindi meaning : " + owner);
this.owner = owner;
start1 = true;
}
if(start1 && start2)
{
String s = getS(text);
System.out.println("Usage is : " + s);
this.s = s;
counter++;
insert();
start2=false;
start1= false;
}
if(start1)
{start2= true;
}
}
}catch(Exception e){
System.out.println(e);
}
}
public void insert(){
String insertString = "INSERT INTO " + TABLE + " VALUES (" + this.counter + ",'" + this.ID
+ "','" + this.title + "','" + this.owner + "','" + this.s + "')";
//System.out.print(insertString);
try {
Statement stmt = conn.createStatement();
stmt.executeUpdate(insertString);
stmt.close();
} catch(Exception e) {
System.out.println(e);
}
}
public String getID(String text){
String id = "";
id = text.substring(text.indexOf("\"")+1, text.indexOf("\","));
return id;
}
public String getTitle(String text){
String title = "";
title = text.substring(text.indexOf(",\"")+2, text.indexOf("\",\"1."));
return title;
}
public String getOwner(String text){
try{
String owner = "";
owner = text.substring(text.indexOf("\",\"1.")+5, text.indexOf("\"<br>"));
int i;
for(i=0;i<owner.length();i++)
{
String fifthChar = "\u00AE";
int codePoint = owner.codePointAt(i);
//System.out.println(codePoint);
//System.out.println("'" + owner.substring(i, i+1) + "': Ux00" + Integer.toString(codePoint, 16));
//System.out.println(fifthChar);
}
//System.out.println(Character.codePointAt(owner));
//System.out.println("enetred");
//System.out.println(owner);
return owner;
}
catch(Exception e)
{ System.out.println(e);
System.out.println("eeee");
}
return owner;
}
public String getS(String text){
String s = "";
s = text.substring(0, text.indexOf("<br>"));
return s;
}
public String download(String path) {
String result = "";
try {
URL url = new URL(path);
URLConnection conn = url.openConnection();
conn.setDoOutput(true);
InputStream in = null;
in = url.openStream();
String content = pipe(in,"utf-8");
//System.out.println(content);
result = content;
} catch (Exception e) {
e.printStackTrace();
}
return result;
}
public String pipe(InputStream in,String charset) throws IOException {
StringBuffer s = new StringBuffer();
if(charset==null||"".equals(charset)){
charset="utf-8";
}
String rLine = null;
BufferedReader bReader = new BufferedReader(new InputStreamReader(in,"UTF-8"));
while ( (rLine = bReader.readLine()) != null)
{
String tmp_rLine = rLine;
s.append(tmp_rLine+"\n");
}
tmp_rLine = null;
in.close();
return s.toString();
}
public static void main(String[] args) {
TestDataParsing2 tdp = new TestDataParsing2();
tdp.initdb();
System.out.println("process started");
String urlPath = "file:///C:/Users/Abhinav/Downloads/Compressed/eng-hindi-dict-
utf8/sa.htm";
String content = tdp.download(urlPath);
tdp.process(content);
tdp.closedb();
}
}
其他所有细节都正常工作。非常感谢任何有用的帮助。