双色球数据抓入Mysql

1 mysql5.1 创建表

--  创建表  id,开奖时间,
CREATE TABLE `ssq` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT,
  `opendate` varchar(20) DEFAULT NULL,
  `openseq` varchar(20) DEFAULT NULL,
  `winningnum` varchar(40) DEFAULT NULL,
  `winningred1` varchar(2) DEFAULT NULL,
  `winningred2` varchar(2) DEFAULT NULL,
  `winningred3` varchar(2) DEFAULT NULL,
  `winningred4` varchar(2) DEFAULT NULL,
  `winningred5` varchar(2) DEFAULT NULL,
  `winningred6` varchar(2) DEFAULT NULL,
  `winningblue` varchar(2) DEFAULT NULL,
  `winningnumber` tinyint(10) DEFAULT NULL,
  `province` varchar(60) DEFAULT NULL,
  `secondwinnum` int(10) DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `ssq_id_pk` (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

 

2 抓取,抓取数据,插入表中。需要的jar包

commons-logging-1.1.1.jar

htmlparser.jar

httpclient-4.2.3.jar

httpcore-4.2.2.jar

mysql-connector-java-5.1.14.jar

原先的Httpclient 现在改名为HttpComponents,可去http://hc.apache.org/ 下载,htmlparser 可去http://sourceforge.net/projects/htmlparser/files/htmlparser/1.6/htmlparser1_6_20060610.zip/download 下载

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
public class DoubleBall {
	HttpClient httpclient = new DefaultHttpClient();    
	public static void main(String[] args) {
		Connection conn = null;
		Statement statement  = null;
		try {
			Class.forName("com.mysql.jdbc.Driver");
			conn = DriverManager.getConnection(
					"jdbc:mysql://localhost:3306/caipiao?rewriteBatchedStatements=true", "root", "root");
			conn.setAutoCommit(false);
			statement = conn.createStatement();
			DoubleBall ball = new DoubleBall();
			for (int i = 1; i < 75; i++) {
				// System.out.println("-- 第" + i + "页双色球数据");
				String url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_" + i
						+ ".html";
				String content = ball.getInternetConent(url);
				ball.getDoubleBallData(content,statement);
			}
			statement.executeBatch();
			conn.commit();
			System.out.println("结束");
		} catch (Exception e) {
			System.out.println(e.getMessage());
		}finally{
			try {
				statement.close();
				conn.close();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
	}  
	private void getDoubleBallData(String content,Statement statement) {
		 Parser parser = Parser.createParser(content, "UTF-8");
         try {
			NodeList nodeList = parser  
			     .extractAllNodesThatMatch(new NodeFilter() {  
			         public boolean accept(Node node) {  
			             if (node instanceof TableTag)// 
			                 return true;  
			             return false;  
			         }  
			});
			if(nodeList.size()>0){  
				for(int i = 0; i < nodeList.size(); ++i){  
		            if(nodeList.elementAt(i) instanceof TableTag){  
		                TableTag tag = (TableTag) nodeList.elementAt(i);  
		                TableRow[] rows = tag.getRows();  
		                for (int j = 2; j < rows.length-1; ++j) {  // 最后一个是分页信息,只要数据
		                	String sql = "insert into ssq(opendate,openseq,winningnum,winningred1,winningred2,winningred3,winningred4,winningred5,winningred6,winningblue,winningnumber,province,secondwinnum) values(";
		                	TableRow row = (TableRow) rows[j];  
		                    TableColumn[] columns = row.getColumns();  
		                    for (int k = 0; k < columns.length; ++k) {
		                    	String info = columns[k].toPlainTextString().trim();  
		                    	if(k==0){
		                           //System.out.print("日期:"+info+"   "); 
		                           sql+="\'"+info+"\',";
		                    	}
		                    	if(k==1){
			                           //System.out.print("期号:"+info+"   "); 
		                    		 sql+="\'"+info+"\',";
			                    }
		                    	if(k==2){
		                    		  String[] sarr = info.split("\r\n          ");
		                    		  String serc = "";
		                    		  for (int l = 0; l < sarr.length; l++) {
										 if(sarr.toString().trim().length()>0){
											 serc += sarr[l].trim()+" ";
										 }
									  }
		                    		  sql+="\'"+serc.trim()+"\',";
		                    		  String[] winningnums = serc.trim().split("  ");
		                    		  for (int l = 0; l < winningnums.length; l++) {
		                    			  sql+="\'"+winningnums[l]+"\',";
		                    		  }
			                          //System.out.print("中奖号码:"+serc+"   ");  
			                    }
		                    	if(k==4){
			                           //System.out.print("一等奖中奖注数:"+info.replace("\r\n", "")+"   ");  
		                    		if(info!=null && info.length()>3){
		                    			String[] firstwiner = info.split("\r\n          ");
			                    		if(firstwiner.length>1){
			                    			sql+=""+firstwiner[0]+",";
				                    		sql+="\'"+firstwiner[1]+"',";
			                    		}
		                    		}else{
		                    			sql+="0,";
			                    		sql+="\'0\',";
		                    		}
		                    		 
		                    	}
		                    	if(k==5){
		                    		//System.out.print("二等奖中奖注数:"+info.replace("\n", "")+"\n");  
		                    		if(info!=null && info.length()>0){
		                    			sql+=info.replace("\n", "");
		                    		}else{
		                    			sql+=0;
		                    		}
		                    	}
		                    }
		                    sql += ")";
		                    statement.addBatch(sql);
		                } 
		            } 
		        }  
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	private String getInternetConent(String url){
		String content  = "";
 	    try {  
	        // 生成一个请求对象    
	        HttpGet httpget = new HttpGet(url);    
	        httpget.setHeader("referer", "http://www.baidu.com/");    
	        // 执行请求   
	        HttpResponse response = httpclient.execute(httpget);    
	        HttpEntity entity = response.getEntity(); 
	        if (entity != null) {  
                //响应内容的长度  
                //long length = entity.getContentLength();  
                //响应内容  
                content = EntityUtils.toString(entity);  
            }  
	        httpget.abort();  
	    } catch (Exception e) {  
	    	httpclient.getConnectionManager().shutdown();  
	    }  
 	    return content;
	}
}

 OK 因为数据比较大,程序执行需要时间,静静等待。

 

你可能感兴趣的:(mysql)