I used to think that standardizing on UTF-8 from top to bottom would let me sleep easy, but today, while crawling Sina Weibo data, I still ran into a character problem.
The data Sina Weibo returns is Unicode-encoded, while the database uses utf8; during conversion some characters come out as 4-byte UTF-8 sequences.
Inserting the crawled data into the database threw this exception:
Incorrect string value: '\xF0\x90\x8D\x83\xF0\x90...'
The offending characters turned out to be not traditional Chinese but some script used in Buddhist scriptures... good grief. By rights UTF-8 should be able to handle them, isn't it supposed to cover everything?
The real culprit is MySQL: with its character set set to utf8 it only stores UTF-8 sequences of up to 3 bytes, yet 4-byte UTF-8 characters do exist, so if the table was created with the utf8 character set the exception is only to be expected.
The fix is simple: change the character set of the column or table to utf8mb4.
The annoying part is that utf8mb4 is only supported from MySQL 5.5.3 onward...
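The conversion itself is a one-off DDL change. Here is a minimal sketch, assuming the same DBUtil.getConnection() and t_sina_weibo_text table used by the crawler below (the class name is just for illustration); depending on the Connector/J version you may also need characterEncoding=UTF-8 in the JDBC URL, and issuing SET NAMES utf8mb4 on each new connection makes sure the session exchanges 4-byte UTF-8 with the server:

import java.sql.Connection;
import java.sql.Statement;
import weibo4j.util.DBUtil;

public class ConvertWeiboTableToUtf8mb4 {
    public static void main(String[] args) throws Exception {
        Connection conn = DBUtil.getConnection();
        Statement stmt = conn.createStatement();
        // make sure this connection talks utf8mb4 to the server
        stmt.execute("SET NAMES utf8mb4");
        // convert the table (and all of its text columns) to the 4-byte character set
        stmt.execute("ALTER TABLE t_sina_weibo_text CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci");
        stmt.close();
        conn.close();
        System.out.println("t_sina_weibo_text converted to utf8mb4");
    }
}

For context, here is the crawler code that ran into the problem: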
package weibo4j.sina;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.sql.BatchUpdateException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.List;
import weibo4j.entity.StockByCompanyName;
import weibo4j.model.WeiboException;
import weibo4j.org.json.JSONArray;
import weibo4j.org.json.JSONException;
import weibo4j.org.json.JSONObject;
import weibo4j.util.BareBonesBrowserLaunch;
import weibo4j.util.Cont;
import weibo4j.util.DBUtil;
import weibo4j.util.DatePaser;
import weibo4j.util.DeleteHtml;
import weibo4j.util.HavingCrawl;
import weibo4j.util.UnicodeToUTF8;
import weibo4j.util.WeiIDPaser;
import weibo4j.weiutils.Oauth;
import weibo4j.weiutils.Search;
public class Sina {
static String label = "server1";
static String access_token = "";
// static long p = 0; // total number of result pages
static int count = 50; // records per page; Sina only returns the latest 200 weibos per call, so a multiple of 10 or 20 keeps the later page math simple
static int searchWaitTime = 3; // seconds to sleep after each keyword has been crawled
static int allSearchWaitTime = 20; // minutes to sleep after all keywords have been crawled
static int successNum = 0; // number of records inserted per batch
static String wei_platform = "新浪微博";
static String t_listed_company_dic = "t_listed_company_dic_sina";
static String t_sina_weibo_text = "t_sina_weibo_text";
public static String crawlTimeTable = "t_weibo_crawl_time_sina"; // table used to check whether a keyword's weibos have already been crawled
public static void main(String[] args) {
Oauth oauth = new Oauth();
try {
BareBonesBrowserLaunch.openURL(oauth.authorize("code"));
System.out.print("enter code:");
BufferedReader br = new BufferedReader(new InputStreamReader(
System.in));
String code = br.readLine();
access_token = oauth.getAccessTokenByCode(code).getAccessToken();
// System.out.print("access_token");
} catch (WeiboException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
Sina.doCrawl(); // start crawling weibos
}
public static void doCrawl() {
Search search = new Search();
search.client.setToken(access_token);
System.out.print("setToken access_token");
Connection conn = null;
PreparedStatement pstmt = null;
String sql = "insert into " + t_sina_weibo_text + "(wei_id,head,nick,wei_text,timestam,daydate,fro,wei_platform,company_name,search_name,company_code,isVerified) "
+ "values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?)";
while (true) {
try {
conn = DBUtil.getConnection();
pstmt = conn.prepareStatement(sql);
String last_companyName_sql = "select company_name from "
+ t_listed_company_dic
+ " where id =(select max(id) from "
+ t_listed_company_dic + " where label ='" + label
+ "' and parent_id=0)";
String lastCompanyName = Cont.get(last_companyName_sql);
String sql_company = "select company_name,company_code,category from "
+ t_listed_company_dic
+ " where parent_id=0 and label='" + label + "'";
List<StockByCompanyName> lists_companies = Cont.getAllCompanyNames(sql_company);
String company_name = "";
String company_code = "";
String search_company = "";
String sinceTime ="";
for (int i = 0; i < lists_companies.size(); i++) {
StockByCompanyName parent_company = lists_companies.get(i); // the company for this keyword group
company_name = parent_company.getCompany_name();
company_code = parent_company.getCompany_code();
System.out.println("company_name:" + company_name + ","
+ company_code );
String sql_search = "select company_name from "
+ t_listed_company_dic
+ " where parent_id in (select id from "
+ t_listed_company_dic + " where company_name='"
+ company_name + "')";
List<String> search_companies = Cont.getContent(sql_search, "company_name");
for (int j = 0; j < search_companies.size(); j++) {
    search_company = search_companies.get(j); // the company's search terms, e.g. 上证综合指数
    // try {
    //     JSONObject ss = search.searchTopics(search_company, count, 1); // call the API once just to get the result count
    //     resultCount = Integer.parseInt(ss.getString("total_number")); // total number of results
    //     if (resultCount == 0) {
    //         break;
    //     }
    // } catch (NumberFormatException e) {
    //     e.printStackTrace();
    // } catch (JSONException e) {
    //     e.printStackTrace();
    // } catch (WeiboException e) {
    //     e.printStackTrace();
    // }
    // p = resultCount % count == 0 ? resultCount / count : resultCount / count + 1; // total number of pages
    // int allowMaxPages = 200 % count == 0 ? 200 / count : 200 / count + 1; // Sina returns at most the latest 200 records, so this is the theoretical maximum number of pages per search term
    for (int a = 4; a > 0; a--) { // 4 pages of count=50 covers the 200-record cap; crawl in descending page-number order
        // (Sina returns only the latest 200 records per call, so the number of pages worth crawling is bounded)
        JSONArray myJsonArray = null;
        String timestam = "";
        boolean isUpdate = false; // whether the keyword's last-crawl time in the table needs to be updated (false = no, true = yes)
try{
JSONObject sss = search.searchTopics(search_company, count,a);
String st = sss.getString("statuses");
System.out.println("返回内容: " + st);
System.out.println("返回内容长度: " + st.length());
if (st.length() < 10) { // when there are no results the API returns just "[]"
    System.out.println("搜索词" + search_company + "第 " + a + "页返回结果为空");
    // System.out.println("本页返回结果为空");
    continue;
}
myJsonArray = new JSONArray(st);
if ((myJsonArray == null) || (myJsonArray.toString().isEmpty())) {
    // if an empty myJsonArray is not handled here, the program throws and aborts
    // the error returned looks like this:
    /*
     * weibo4j.model.WeiboException: A JSONObject text must begin with '{' at character 1:
     * 504 Gateway Time-out
     * nginx
     */
continue;
}
System.out.println("开始采集搜索词" + search_company + "第 " + a +"页内容");
// comparing the post time of the first weibo on the page against table t_weibo_crawl_time tells us whether this page has already been crawled
if(!HavingCrawl.isCrawlByPage(search_company, myJsonArray.getJSONObject(0).getString("created_at"))){
System.out.println("本页微博内容已经采集过!");
continue;
}
} catch (JSONException e) {
    e.printStackTrace();
    continue; // without this, a failed call leaves myJsonArray null and the loop below throws a NullPointerException
} catch (WeiboException e) {
    e.printStackTrace();
    continue;
}
// System.out.println(myJsonArray.length());
// each page is ordered by post time, newest first, so iterate the records in reverse to make comparison with the crawl-time table easier
String tempTime = "";
sinceTime = Cont.getDateBySearchName("select sina_end_time from " +crawlTimeTable + " where search_name =\"" + search_company + "\"");
for (int ii = myJsonArray.length()-1; ii >=0; ii--) {
String wei_text = null;
String mid = null;
String fro = null;
String image = null;
String user_id = null;
String head = null;
String nick = null;
String isVerified = null;
try{
// get this record's JSONObject
JSONObject myjObject = myJsonArray.getJSONObject(ii);
// read the fields of the object
timestam = DatePaser.getDate(myjObject.getString("created_at"));
if (!HavingCrawl.isCrawl(sinceTime, timestam)) {
    continue; // this weibo was posted earlier than the time recorded in the table, so skip it instead of inserting
}else{
tempTime = timestam;
isUpdate =true;
}
wei_text = myjObject.getString("text");//text返回的编码格式是unicode,某些unicode字符转成utf8编码之后有4个字节,mysql 只能存放3个字节的UTF8
// UnicodeToUTF8.unicodeToUtf8(wei_text);
mid = myjObject.getString("mid");
fro = DeleteHtml.htmlFilter(myjObject.getString("source"));
image = myjObject.getString("thumbnail_pic");//缩略图片地址,没有时不返回此字段
//获取数组中对象的对象
JSONObject myjObject2 = myjObject.getJSONObject("user");
nick = myjObject2.getString("screen_name");
isVerified = myjObject2.getString("verified");
head = myjObject2.getString("profile_image_url");
user_id = myjObject2.getString("id");
}catch (JSONException e) {
e.printStackTrace();
}
pstmt.setString(1, user_id + WeiIDPaser.getWeiID(mid));
pstmt.setString(2, head);
pstmt.setString(3, nick);
pstmt.setString(4, wei_text);
pstmt.setString(5, timestam);
pstmt.setString(6, timestam.substring(0, 10));
pstmt.setString(7, fro);
pstmt.setString(8, wei_platform);
pstmt.setString(9, company_name); // company name
pstmt.setString(10, search_company); // search term
pstmt.setString(11, company_code); // company code
pstmt.setString(12, isVerified);
pstmt.addBatch();
try{
successNum = pstmt.executeBatch().length; // number of records successfully inserted
System.out.println(sql);
System.out.println("成功插入到数据库数: " + successNum);
if(isUpdate){
Cont.updateEndTime(timestam, search_company, crawlTimeTable);
System.out.println(crawlTimeTable + "库表时间已经更新:更新的关键词" + search_company);
}
}catch(BatchUpdateException e){
e.printStackTrace();
}
}
// (this is where the batch insert used to live, outside the per-record loop; see the note at the end of the post)
// successNum = pstmt.executeBatch().length; // number of records successfully inserted
// System.out.println(sql);
// System.out.println("成功插入到数据库数: " + successNum);
// if(isUpdate){
// Cont.updateEndTime(timestam, search_company, crawlTimeTable);
// System.out.println(crawlTimeTable + "库表时间已经更新:更新的关键词" + search_company);
// }
System.out.println("关键词 " + search_company + "第 " + a + "采集完毕,休眠" + searchWaitTime + "秒钟后继续");
Thread.sleep(searchWaitTime * 1000); // sleep after each keyword page is crawled
}
}
if (company_name.equals(lastCompanyName)) {
System.out.println("所有关键词采集完毕,休眠" + allSearchWaitTime + "分钟后继续轮循采集...");
Thread.sleep(allSearchWaitTime* 60 * 1000);
}
}
pstmt.close();
conn.close();
}catch (SQLException e) {
e.printStackTrace();
}catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
One more note: the batch-insert code below originally sat outside the for loop. Whenever a 4-byte character showed up mid-run, the exception would abort the program, or the crawl would die after only a few keywords and have to start over from the beginning. Moving the code inside the for loop fixed this, because it confines the thrown exception to that loop iteration:
try{
successNum = pstmt.executeBatch().length; // number of records successfully inserted
System.out.println(sql);
System.out.println("成功插入到数据库数: " + successNum);
if(isUpdate){
Cont.updateEndTime(timestam, search_company, crawlTimeTable);
System.out.println(crawlTimeTable + "库表时间已经更新:更新的关键词" + search_company);
}
}catch(BatchUpdateException e){
e.printStackTrace();
}
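If upgrading to MySQL 5.5.3+ is not an option, another way out is to strip the 4-byte (supplementary) characters from wei_text before binding it to the PreparedStatement, in the spirit of the commented-out UnicodeToUTF8 call above. A minimal sketch of such a helper (the method name is mine, not part of weibo4j):

// Drop code points above U+FFFF, i.e. everything that needs 4 bytes in UTF-8,
// so the remaining text fits into a 3-byte mysql utf8 column.
public static String stripSupplementaryChars(String text) {
    if (text == null) {
        return null;
    }
    StringBuilder sb = new StringBuilder(text.length());
    int i = 0;
    while (i < text.length()) {
        int cp = text.codePointAt(i);
        if (!Character.isSupplementaryCodePoint(cp)) {
            sb.appendCodePoint(cp);
        }
        i += Character.charCount(cp);
    }
    return sb.toString();
}

Losing those characters is usually acceptable for text analysis, but converting the table to utf8mb4 remains the cleaner fix.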