1 mysql5.1 创建表
-- 创建表 id,开奖时间, CREATE TABLE `ssq` ( `id` bigint(20) NOT NULL AUTO_INCREMENT, `opendate` varchar(20) DEFAULT NULL, `openseq` varchar(20) DEFAULT NULL, `winningnum` varchar(40) DEFAULT NULL, `winningred1` varchar(2) DEFAULT NULL, `winningred2` varchar(2) DEFAULT NULL, `winningred3` varchar(2) DEFAULT NULL, `winningred4` varchar(2) DEFAULT NULL, `winningred5` varchar(2) DEFAULT NULL, `winningred6` varchar(2) DEFAULT NULL, `winningblue` varchar(2) DEFAULT NULL, `winningnumber` tinyint(10) DEFAULT NULL, `province` varchar(60) DEFAULT NULL, `secondwinnum` int(10) DEFAULT NULL, PRIMARY KEY (`id`), KEY `ssq_id_pk` (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
2 抓取,抓取数据,插入表中。需要的jar包
commons-logging-1.1.1.jar
htmlparser.jar
httpclient-4.2.3.jar
httpcore-4.2.2.jar
mysql-connector-java-5.1.14.jar
原先的Httpclient 现在改名为HttpComponents,可去http://hc.apache.org/ 下载,htmlparser 可去http://sourceforge.net/projects/htmlparser/files/htmlparser/1.6/htmlparser1_6_20060610.zip/download 下载
import java.sql.Connection; import java.sql.DriverManager; import java.sql.SQLException; import java.sql.Statement; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.tags.TableColumn; import org.htmlparser.tags.TableRow; import org.htmlparser.tags.TableTag; import org.htmlparser.util.NodeList; public class DoubleBall { HttpClient httpclient = new DefaultHttpClient(); public static void main(String[] args) { Connection conn = null; Statement statement = null; try { Class.forName("com.mysql.jdbc.Driver"); conn = DriverManager.getConnection( "jdbc:mysql://localhost:3306/caipiao?rewriteBatchedStatements=true", "root", "root"); conn.setAutoCommit(false); statement = conn.createStatement(); DoubleBall ball = new DoubleBall(); for (int i = 1; i < 75; i++) { // System.out.println("-- 第" + i + "页双色球数据"); String url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_" + i + ".html"; String content = ball.getInternetConent(url); ball.getDoubleBallData(content,statement); } statement.executeBatch(); conn.commit(); System.out.println("结束"); } catch (Exception e) { System.out.println(e.getMessage()); }finally{ try { statement.close(); conn.close(); } catch (SQLException e) { e.printStackTrace(); } } } private void getDoubleBallData(String content,Statement statement) { Parser parser = Parser.createParser(content, "UTF-8"); try { NodeList nodeList = parser .extractAllNodesThatMatch(new NodeFilter() { public boolean accept(Node node) { if (node instanceof TableTag)// return true; return false; } }); if(nodeList.size()>0){ for(int i = 0; i < nodeList.size(); ++i){ if(nodeList.elementAt(i) instanceof TableTag){ TableTag tag = (TableTag) nodeList.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 2; j < rows.length-1; ++j) { // 最后一个是分页信息,只要数据 String sql = "insert into ssq(opendate,openseq,winningnum,winningred1,winningred2,winningred3,winningred4,winningred5,winningred6,winningblue,winningnumber,province,secondwinnum) values("; TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); for (int k = 0; k < columns.length; ++k) { String info = columns[k].toPlainTextString().trim(); if(k==0){ //System.out.print("日期:"+info+" "); sql+="\'"+info+"\',"; } if(k==1){ //System.out.print("期号:"+info+" "); sql+="\'"+info+"\',"; } if(k==2){ String[] sarr = info.split("\r\n "); String serc = ""; for (int l = 0; l < sarr.length; l++) { if(sarr.toString().trim().length()>0){ serc += sarr[l].trim()+" "; } } sql+="\'"+serc.trim()+"\',"; String[] winningnums = serc.trim().split(" "); for (int l = 0; l < winningnums.length; l++) { sql+="\'"+winningnums[l]+"\',"; } //System.out.print("中奖号码:"+serc+" "); } if(k==4){ //System.out.print("一等奖中奖注数:"+info.replace("\r\n", "")+" "); if(info!=null && info.length()>3){ String[] firstwiner = info.split("\r\n "); if(firstwiner.length>1){ sql+=""+firstwiner[0]+","; sql+="\'"+firstwiner[1]+"',"; } }else{ sql+="0,"; sql+="\'0\',"; } } if(k==5){ //System.out.print("二等奖中奖注数:"+info.replace("\n", "")+"\n"); if(info!=null && info.length()>0){ sql+=info.replace("\n", ""); }else{ sql+=0; } } } sql += ")"; statement.addBatch(sql); } } } } } catch (Exception e) { e.printStackTrace(); } } private String getInternetConent(String url){ String content = ""; try { // 生成一个请求对象 HttpGet httpget = new HttpGet(url); httpget.setHeader("referer", "http://www.baidu.com/"); // 执行请求 HttpResponse response = httpclient.execute(httpget); HttpEntity entity = response.getEntity(); if (entity != null) { //响应内容的长度 //long length = entity.getContentLength(); //响应内容 content = EntityUtils.toString(entity); } httpget.abort(); } catch (Exception e) { httpclient.getConnectionManager().shutdown(); } return content; } }
OK 因为数据比较大,程序执行需要时间,静静等待。