异常处理:使用UTF-8编码Mysql仍然抛出Incorrect string value异常

之前还以为从上至下统一用上UTF-8就高枕无忧了,哪知道今天在抓取新浪微博的数据的时候还是遇到字符的异常。

新浪微博返回的数据编码是unicode,而数据库用的是utf8,在转换的时候有些字符会得到4字节的UTF-8编码。

从新浪微博抓到的数据在入库的时候抛出异常:

Incorrect string value: '\xF0\x90\x8D\x83\xF0\x90...' 

发现导致异常的字符不是繁体而是某种佛经文字。。。额滴神。。。但是按道理UTF-8应该能支持才对啊,它不是万能的么?


原来问题出在mysql上,mysql如果设置编码集为utf8那么它最多只能支持到3个字节的UTF-8编码,而4个字节的UTF-8字符还是存在的,这样一来如果你建表的时候用的utf8字符集出异常就理所当然了。


解决方法很简单,修改字段或者表的字符集为utf8mb4。


比较蛋疼的是,字符集utf8mb4在mysql 5.5.3之后才支持。。。



package weibo4j.sina;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.sql.BatchUpdateException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.List;

import weibo4j.entity.StockByCompanyName;
import weibo4j.model.WeiboException;
import weibo4j.org.json.JSONArray;
import weibo4j.org.json.JSONException;
import weibo4j.org.json.JSONObject;
import weibo4j.util.BareBonesBrowserLaunch;
import weibo4j.util.Cont;
import weibo4j.util.DBUtil;
import weibo4j.util.DatePaser;
import weibo4j.util.DeleteHtml;
import weibo4j.util.HavingCrawl;
import weibo4j.util.UnicodeToUTF8;
import weibo4j.util.WeiIDPaser;
import weibo4j.weiutils.Oauth;
import weibo4j.weiutils.Search;

public class Sina {


	static String label = "server1";
	static String access_token = "";
//	static long p = 0;  //返回记录的总页数
	static int count = 50;     //每页记录数,由于新浪每次只返回最新200条微博,建议此值设置为20或10的倍数,以方便后面计算;
	static int searchWaitTime =3;   //秒,每个关键词采集完后休眠时间,
	static int allSearchWaitTime = 20 ;//分,所有关键词采集完后休眠时间,
	static int successNum =0;//每批次插入数据的记录数
	static String wei_platform ="新浪微博";
	static String t_listed_company_dic = "t_listed_company_dic_sina";
	static String t_sina_weibo_text = "t_sina_weibo_text";
	public static String crawlTimeTable ="t_weibo_crawl_time_sina";//库表,用于判断某关键词微博是否已经采集过

	public static void main(String[] args) {
		Oauth oauth = new Oauth();
		try {
			BareBonesBrowserLaunch.openURL(oauth.authorize("code"));
			System.out.print("enter code:");
			BufferedReader br = new BufferedReader(new InputStreamReader(
					System.in));
			String code = br.readLine();
			access_token = oauth.getAccessTokenByCode(code).getAccessToken();
//			System.out.print("access_token");
		} catch (WeiboException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}

		Sina.doCrawl();// 采集微博

	}

	public static void doCrawl() {
		Search search = new Search();
		search.client.setToken(access_token);
		System.out.print("setToken access_token");
		
		Connection conn = null;
		PreparedStatement pstmt = null;
		
		String sql = "insert into " + t_sina_weibo_text + "(wei_id,head,nick,wei_text,timestam,daydate,fro,wei_platform,company_name,search_name,company_code,isVerified) "
				+ "values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?)";


		while (true) {
			try {
				conn = DBUtil.getConnection();
				pstmt = conn.prepareStatement(sql);
				String last_companyName_sql = "select company_name from "
						+ t_listed_company_dic
						+ " where id =(select max(id) from "
						+ t_listed_company_dic + " where label ='" + label
						+ "' and parent_id=0)";
				String lastCompanyName = Cont.get(last_companyName_sql);

				String sql_company = "select company_name,company_code,category from "
						+ t_listed_company_dic
						+ " where parent_id=0 and label='" + label + "'";
				List lists_companies = Cont
						.getAllCompanyNames(sql_company);
				String company_name = "";
				String company_code = "";
				String search_company = "";
				String sinceTime ="";

				for (int i = 0; i < lists_companies.size(); i++) {
					StockByCompanyName parent_company = lists_companies.get(i);// 得到公司名
					company_name = parent_company.getCompany_name();
					company_code = parent_company.getCompany_code();
					System.out.println("company_name:" + company_name + ","
							+ company_code );
					String sql_search = "select company_name from "
							+ t_listed_company_dic
							+ " where parent_id in (select id from "
							+ t_listed_company_dic + " where company_name='"
							+ company_name + "')";
					List search_companies = Cont.getContent(sql_search,
							"company_name");

					for (int j = 0; j < search_companies.size(); j++) {

						search_company = search_companies.get(j);// 例如:上证综合指数等公司搜索词
//						try{
//							JSONObject ss = search.searchTopics(search_company, count,1);//调用接口获取结果数量
//							resultCount = Integer.parseInt(ss.getString("total_number"));//返回结果总数
//							if(resultCount==0){
//								break;
//							}
//						} catch (NumberFormatException e) {
//							e.printStackTrace();
//						} catch (JSONException e) {
//							e.printStackTrace();
//						} catch (WeiboException e) {
//							e.printStackTrace();
//						} 

					
//						p =  resultCount%count==0?resultCount/count:resultCount/count+1;//获取总页数

//						int allowMaxPages = 200%count==0?200/count:200/count+1;//假设所有搜索词返回记录数超200个,则理论上每个搜索词最大可采页数为allowMaxPages
						for(int a =4;a >0;a--){           //循环采集,判断可采总页数,然后倒序页号采集
							//循环采集,根据总页数和最大可采页数(新浪一次调用接口只返回最新200条)
							JSONArray myJsonArray =null;
							String timestam ="";
							boolean isUpdate = false;//是否需要更新表中关键词最后采集时间;false为不更新,true为更新
							try{
								JSONObject sss = search.searchTopics(search_company, count,a);
								
								String st = sss.getString("statuses");
								
								System.out.println("返回内容: " + st);
								System.out.println("返回内容长度: " + st.length());
								if(st.length()<10){//st为空时只返回“[]”符号
									System.out.println("搜索词" + search_company + "第 " + a +"页返回结果为空");
//									System.out.println("本页返回结果为空");
									continue;
								}
	
								myJsonArray = new JSONArray(st);
								if((myJsonArray==null) || (myJsonArray.toString()=="")){
									//发现当myJsonArray返回为空时不做处理会抛错并中断程序
									//返回的错误代码:
									/*
									 *weibo4j.model.WeiboException: A JSONObject text must begin with '{' at character 1:
									 *504 Gateway Time-out
									 *
									 *

504 Gateway Time-out

*
nginx
* * */ continue; } System.out.println("开始采集搜索词" + search_company + "第 " + a +"页内容"); //判断每页第一条微博发表时间(和表t_weibo_crawl_time对比)即可判断内容是否已采集 if(!HavingCrawl.isCrawlByPage(search_company, myJsonArray.getJSONObject(0).getString("created_at"))){ System.out.println("本页微博内容已经采集过!"); continue; } }catch (JSONException e) { e.printStackTrace(); } catch (WeiboException e) { e.printStackTrace(); } // System.out.println(myJsonArray.length()); //返回的每页内容是按微博发表时间降序排列,故对返回的记录做倒序遍历,便于和时间表的时间对比 String tempTime = ""; sinceTime = Cont.getDateBySearchName("select sina_end_time from " +crawlTimeTable + " where search_name =\"" + search_company + "\""); for (int ii = myJsonArray.length()-1; ii >=0; ii--) { String wei_text = null; String mid = null; String fro = null; String image = null; String user_id = null; String head = null; String nick = null; String isVerified = null; try{ // 获取每一个JsonObject对象 JSONObject myjObject = myJsonArray.getJSONObject(ii); // 获取每一个对象中的值 timestam = DatePaser.getDate(myjObject.getString("created_at")); if(!HavingCrawl.isCrawl(sinceTime, timestam)){ continue;//返回的某页内某条微博内容创建时间早于表中记录,则跳出循环,不插入数据库 }else{ tempTime = timestam; isUpdate =true; } wei_text = myjObject.getString("text");//text返回的编码格式是unicode,某些unicode字符转成utf8编码之后有4个字节,mysql 只能存放3个字节的UTF8 // UnicodeToUTF8.unicodeToUtf8(wei_text); mid = myjObject.getString("mid"); fro = DeleteHtml.htmlFilter(myjObject.getString("source")); image = myjObject.getString("thumbnail_pic");//缩略图片地址,没有时不返回此字段 //获取数组中对象的对象 JSONObject myjObject2 = myjObject.getJSONObject("user"); nick = myjObject2.getString("screen_name"); isVerified = myjObject2.getString("verified"); head = myjObject2.getString("profile_image_url"); user_id = myjObject2.getString("id"); }catch (JSONException e) { e.printStackTrace(); } pstmt.setString(1, user_id + WeiIDPaser.getWeiID(mid)); pstmt.setString(2, head); pstmt.setString(3, nick); pstmt.setString(4, wei_text); pstmt.setString(5, timestam); pstmt.setString(6, timestam.substring(0, 10)); 
pstmt.setString(7, fro); pstmt.setString(8, wei_platform); pstmt.setString(9, company_name);// 公司名 pstmt.setString(10, search_company);// 插入搜索名 pstmt.setString(11, company_code);// 插入公司代码 pstmt.setString(12, isVerified); pstmt.addBatch(); try{ successNum = pstmt.executeBatch().length;// 成功插入数据库记录条数 System.out.println(sql); System.out.println("成功插入到数据库数: " + successNum); if(isUpdate){ Cont.updateEndTime(timestam, search_company, crawlTimeTable); System.out.println(crawlTimeTable + "库表时间已经更新:更新的关键词" + search_company); } }catch(BatchUpdateException e){ e.printStackTrace(); } } // successNum = pstmt.executeBatch().length;// 成功插入数据库记录条数 // System.out.println(sql); // System.out.println("成功插入到数据库数: " + successNum); // if(isUpdate){ // Cont.updateEndTime(timestam, search_company, crawlTimeTable); // System.out.println(crawlTimeTable + "库表时间已经更新:更新的关键词" + search_company); // } System.out.println("关键词 " + search_company + "第 " + a + "采集完毕,休眠" + searchWaitTime + "秒钟后继续"); Thread.sleep(searchWaitTime*1000);//每个关键词采集完后休眠时间 } } if (company_name.equals(lastCompanyName)) { System.out.println("所有关键词采集完毕,休眠" + allSearchWaitTime + "分钟后继续轮循采集..."); Thread.sleep(allSearchWaitTime* 60 * 1000); } } pstmt.close(); conn.close(); }catch (SQLException e) { e.printStackTrace(); }catch (InterruptedException e) { e.printStackTrace(); } } } }



之前是把下面插入表的代码放在for循环外面,由于中途碰上4字节字符抛错,经常导致程序中断,或者没采几个关键词就中断从头来。后来把它们放进for循环内才解决——把抛异常的影响限制在for循环内。


try{  
                                    successNum = pstmt.executeBatch().length;// 成功插入数据库记录条数  
                                    System.out.println(sql);  
                                    System.out.println("成功插入到数据库数: " + successNum);   
                                    if(isUpdate){  
                                        Cont.updateEndTime(timestam, search_company, crawlTimeTable);  
                                        System.out.println(crawlTimeTable + "库表时间已经更新:更新的关键词" + search_company);  
                                    }  
                                }catch(BatchUpdateException e){  
                                    e.printStackTrace();  
                                }  



你可能感兴趣的:(java,mysql/mariadb)