package com.smilezl.scrapy;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.log4j.Logger;
/**
 * Periodic page scraper.
 *
 * <p>On every timer tick the class reads the not-yet-processed rows
 * ({@code type = 0}) from the {@code scrapyurl} table, downloads each page,
 * extracts the runs of CJK ideographs whose length lies within the configured
 * bounds, and stores them in the {@code scrapycontent} table.
 *
 * <p>All members are static; the class is used as a utility/entry point and is
 * never instantiated by the code in this file.
 */
public class ScrapyGet {
    private static final Logger log = Logger.getLogger(ScrapyGet.class);

    /** Connection pool shared by every {@link HttpClient} created in this class. */
    private static final MultiThreadedHttpConnectionManager connectionManager =
            new MultiThreadedHttpConnectionManager();

    private static final int CONNECTION_TIMEOUT_MS = 15000;
    private static final int SOCKET_TIMEOUT_MS = 15000;
    private static final int READ_TIMEOUT_MS = 20000;
    private static final int MAX_CONNECTIONS_PER_HOST = 4;
    private static final int MAX_TOTAL_CONNECTIONS = 40;

    /** Inclusive length bounds (in chars) for an extracted text run to be kept. */
    private static final int MIN_CONTENT_LENGTH = 4;
    private static final int MAX_CONTENT_LENGTH = 400;

    /** Charset assumed when the server does not announce one. */
    private static final String DEFAULT_CHARSET = "gbk";

    /** Extracts the value of a {@code charset=} parameter, tolerating case, spaces and quotes. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** One or more consecutive characters in the range U+4E00..U+9FA5. */
    private static final Pattern CHINESE_RUN_PATTERN = Pattern.compile("([\u4e00-\u9fa5]+)");

    private static final String JDBC_DRIVER = "org.postgresql.Driver";
    // NOTE(review): the original code used two slightly different JDBC URLs (the one used for
    // saving content ends in a dangling '&'). Both literals are kept exactly as they were;
    // confirm whether they can be unified.
    private static final String CONTENT_JDBC_URL =
            "jdbc:postgresql://localhost:5432/mydb?useUnicode=true&";
    private static final String URL_JDBC_URL =
            "jdbc:postgresql://localhost:5432/mydb?useUnicode=true&characterEncoding=gbk";
    // NOTE(review): credentials are hard-coded in source; consider moving them to configuration.
    private static final String DB_USER = "postgres";
    private static final String DB_PASSWORD = "password";

    /** Delay before the first scheduled run and interval between runs, in milliseconds. */
    private static final long TIMER_DELAY_MS = 10 * 1000;
    private static final long TIMER_PERIOD_MS = 10 * 1000;

    private static final Timer timer = new Timer();

    /** Whether {@link #SetPara()} has already configured the connection manager. */
    private static boolean initialed = false;

    /**
     * Scheduled-task flag: {@code true} while a scraping pass is in progress.
     * Written by the timer thread and read by callers of {@link #timerSchedule()},
     * hence {@code volatile}.
     */
    public static volatile boolean isRunning = false;

    /**
     * Applies the timeout and pool-size settings to the shared connection manager.
     * Called lazily before the first download; calling it again is harmless.
     */
    public static void SetPara() {
        connectionManager.getParams().setConnectionTimeout(CONNECTION_TIMEOUT_MS);
        connectionManager.getParams().setSoTimeout(SOCKET_TIMEOUT_MS);
        connectionManager.getParams().setDefaultMaxConnectionsPerHost(MAX_CONNECTIONS_PER_HOST);
        connectionManager.getParams().setMaxTotalConnections(MAX_TOTAL_CONNECTIONS);
        initialed = true;
    }

    /**
     * Downloads a page, choosing the target charset from the server's
     * {@code Content-Type} header (falling back to {@code gbk}).
     *
     * <p>The URL is requested twice: once through {@link HttpURLConnection} to read the
     * content type, then through {@link #getResponseByGetMethod(String, String)} for the body.
     *
     * @param htmlUrl address of the page to fetch
     * @return the page text, or an empty string when the page could not be fetched
     */
    public static String getResponseByGetMethod(String htmlUrl) {
        System.out.println("抓取链接: " + htmlUrl);
        HttpURLConnection connection = null;
        String charSet;
        try {
            java.net.URLConnection raw = new URL(htmlUrl).openConnection();
            if (!(raw instanceof HttpURLConnection)) {
                log.error("Not an HTTP(S) URL, skipping: " + htmlUrl);
                return "";
            }
            connection = (HttpURLConnection) raw;
            // Without explicit timeouts the probe could block the timer thread indefinitely.
            connection.setConnectTimeout(CONNECTION_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);
            charSet = getCharset(connection.getContentType());
            if (charSet == null) {
                charSet = DEFAULT_CHARSET;
            }
        } catch (IOException e) {
            // The previous implementation returned the charset name here, which callers
            // then treated as page content. An empty string means "no content".
            log.error("Failed to probe content type of " + htmlUrl, e);
            return "";
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return getResponseByGetMethod(htmlUrl, charSet);
    }

    /**
     * Extracts the charset name from a {@code Content-Type} style string.
     *
     * @param str header value such as {@code text/html; charset=utf-8}; may be {@code null}
     * @return the charset token (without quotes, surrounding spaces or trailing parameters),
     *         or {@code null} when {@code str} is null/empty or carries no charset value
     */
    public static String getCharset(String str) {
        if (str == null || str.length() == 0) {
            return null;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(str);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Downloads a page with Commons HttpClient and returns its text.
     *
     * <p>The body is decoded with the charset HttpClient reports for the response and then
     * re-interpreted as {@code charSet}; this recovers the text when the response carries no
     * charset and HttpClient falls back to its default.
     *
     * @param url     address of the page to fetch
     * @param charSet charset the page is believed to be written in; {@code null} means
     *                the default ({@code gbk})
     * @return the page text with lines separated by {@code '\n'}, or an empty string on a
     *         non-200 status, an invalid URL, or any I/O failure
     */
    public static String getResponseByGetMethod(String url, String charSet) {
        if (!initialed) {
            SetPara();
        }
        String targetCharSet = (charSet == null) ? DEFAULT_CHARSET : charSet;
        HttpClient client = new HttpClient(connectionManager);

        GetMethod getMethod;
        try {
            getMethod = new GetMethod(url);
        } catch (IllegalArgumentException e) {
            log.error("Invalid URL, skipping: " + url, e);
            return "";
        } catch (IllegalStateException e) {
            log.error("Unsupported URL, skipping: " + url, e);
            return "";
        }
        getMethod.getParams().setSoTimeout(READ_TIMEOUT_MS);

        try {
            int statusCode = client.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                log.error("Unexpected HTTP status for " + url + ": " + getMethod.getStatusLine());
                return "";
            }
            InputStream in = getMethod.getResponseBodyAsStream();
            if (in == null) {
                return "";
            }
            String responseCharSet = getMethod.getResponseCharSet();
            StringBuilder content = new StringBuilder();
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, responseCharSet));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    // Was the two-character literal "/n"; a real line break is intended.
                    content.append(line).append('\n');
                }
            } finally {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // The body has already been read; a failure to close must not discard it.
                }
            }
            return convertStringCode(content.toString(), responseCharSet, targetCharSet);
        } catch (IOException e) {
            log.error("Failed to fetch " + url, e);
            return "";
        } finally {
            getMethod.releaseConnection();
        }
    }

    /**
     * Re-interprets {@code source} (decoded as {@code srcEncode}) as {@code destEncode}.
     *
     * @return the converted text; {@code source} unchanged when both charset names are equal
     *         (ignoring case); an empty string for null/empty input or an unsupported charset
     */
    private static String convertStringCode(String source, String srcEncode, String destEncode) {
        if (source == null || source.length() == 0) {
            return "";
        }
        if (srcEncode != null && srcEncode.equalsIgnoreCase(destEncode)) {
            return source;
        }
        try {
            return new String(source.getBytes(srcEncode), destEncode);
        } catch (UnsupportedEncodingException e) {
            log.error("Unsupported charset while converting " + srcEncode + " -> " + destEncode, e);
            return "";
        }
    }

    /**
     * Collects every run of consecutive characters in U+4E00..U+9FA5 whose length is
     * between 4 and 400 characters (inclusive), in order of appearance.
     *
     * @param content text to scan; {@code null} yields an empty list
     * @return the matching runs (never {@code null})
     */
    public static List<String> parserRegex(String content) {
        List<String> list = new ArrayList<String>();
        if (content == null) {
            return list;
        }
        Matcher matcher = CHINESE_RUN_PATTERN.matcher(content);
        while (matcher.find()) {
            String run = matcher.group(0);
            if (run.length() >= MIN_CONTENT_LENGTH && run.length() <= MAX_CONTENT_LENGTH) {
                list.add(run);
            }
        }
        return list;
    }

    /**
     * Marks {@code model}'s URL as processed ({@code type = 1}) and stores the extracted
     * text runs as rows of {@code scrapycontent}.
     *
     * <p>The URL is marked <em>before</em> the inserts, so a URL is attempted only once even
     * when a later insert fails. Sequence ids continue after the current
     * {@code max(seqid)}; when that maximum is 0 or absent, numbering starts at 2, as before.
     * Failures are logged and never propagated.
     *
     * @param list  text runs to store
     * @param model the source URL record (provides the URL and the parent sequence id)
     */
    public static void saveScrapyContent(List<String> list, UrlModel model) {
        if (list == null || model == null) {
            log.error("saveScrapyContent called with null argument; nothing saved");
            return;
        }
        Connection con = null;
        Statement st = null;
        ResultSet rs = null;
        PreparedStatement markUrl = null;
        PreparedStatement insert = null;
        try {
            con = openConnection(CONTENT_JDBC_URL);
            st = con.createStatement();
            System.out.println("记录总数" + list.size());

            int seqid = 1;
            rs = st.executeQuery("select max(seqid) as m from scrapycontent");
            if (rs.next()) {
                int max = rs.getInt(1); // 0 when the table is empty (SQL NULL)
                if (max != 0) {
                    seqid = max;
                }
            }

            // Flag the URL as already scraped. Bound parameters instead of string
            // concatenation: URLs may contain quotes.
            markUrl = con.prepareStatement("update scrapyurl set type = 1 where url = ?");
            markUrl.setString(1, model.getUrl());
            markUrl.executeUpdate();

            insert = con.prepareStatement(
                    "insert into scrapycontent(content,seqid,parentseqid) values(?,?,?)");
            for (int i = 0; i < list.size(); i++) {
                insert.setString(1, list.get(i));
                insert.setInt(2, seqid + i + 1);
                // setObject: the declared type of getSeqid() is not visible from this file.
                insert.setObject(3, model.getSeqid());
                insert.executeUpdate();
            }
        } catch (Exception e) {
            // Best-effort boundary: callers rely on this method never throwing.
            log.error("Failed to save scraped content for " + model.getUrl(), e);
        } finally {
            closeQuietly(rs);
            closeQuietly(insert);
            closeQuietly(markUrl);
            closeQuietly(st);
            closeQuietly(con);
        }
    }

    /**
     * Loads every URL record that has not been processed yet ({@code type = 0}),
     * ordered by URL.
     *
     * @return the records read so far; empty (never {@code null}) when the query fails
     */
    public static List<UrlModel> queryScrapyUrl() {
        List<UrlModel> list = new ArrayList<UrlModel>();
        Connection con = null;
        Statement st = null;
        ResultSet rs = null;
        try {
            con = openConnection(URL_JDBC_URL);
            st = con.createStatement();
            rs = st.executeQuery("select url,type,seqid from scrapyurl where type=0 order by url");
            while (rs.next()) {
                UrlModel model = new UrlModel();
                model.setUrl(rs.getString("url"));
                model.setType(rs.getInt("type"));
                model.setSeqid(rs.getInt("seqid"));
                list.add(model);
            }
        } catch (Exception e) {
            // Best-effort boundary: callers rely on this method never throwing.
            log.error("Failed to query pending URLs", e);
        } finally {
            closeQuietly(rs);
            closeQuietly(st);
            closeQuietly(con);
        }
        return list;
    }

    /**
     * Fetches one URL, extracts its text runs and stores them.
     *
     * @param model the URL record to process
     */
    public static void parserData(UrlModel model) {
        String content = ScrapyGet.getResponseByGetMethod(model.getUrl());
        List<String> list = parserRegex(content);
        saveScrapyContent(list, model);
    }

    /**
     * Processes every pending URL, then clears {@link #isRunning}, even when
     * processing is cut short by an unexpected exception.
     */
    public static void parserScrapyUrlData() {
        try {
            List<UrlModel> list = queryScrapyUrl();
            for (int i = 0; i < list.size(); i++) {
                parserData(list.get(i));
            }
        } finally {
            isRunning = false;
        }
    }

    /** Timer task that runs one full scraping pass. */
    public static class MyTask extends TimerTask {
        @Override
        public void run() {
            isRunning = true;
            // Process all pending data.
            System.out.println("running");
            try {
                parserScrapyUrlData();
            } catch (RuntimeException e) {
                // An exception escaping a TimerTask terminates the whole Timer;
                // log it so the next scheduled run still happens.
                log.error("Scraping pass failed", e);
            }
        }
    }

    /**
     * Schedules a repeating {@link MyTask} unless a pass is currently running.
     * Each call made while no pass is running registers an additional repeating task.
     */
    public static void timerSchedule() {
        System.out.println("定时开始");
        if (!isRunning) {
            System.out.println("执行任务");
            MyTask task = new ScrapyGet.MyTask();
            timer.schedule(task, TIMER_DELAY_MS, TIMER_PERIOD_MS);
        }
    }

    /**
     * Assigns sequence numbers 1..n to the pending URL records ({@code type = 0}),
     * in URL order. Failures are logged and never propagated.
     */
    public static void setScrapyNum() {
        Connection con = null;
        Statement st = null;
        ResultSet rs = null;
        PreparedStatement update = null;
        try {
            con = openConnection(URL_JDBC_URL);
            st = con.createStatement();
            rs = st.executeQuery("select url from scrapyurl where type=0 order by url");

            // Read everything first: executing another statement on the Statement that owns
            // an open ResultSet implicitly closes that ResultSet, which previously aborted
            // the loop after the first row.
            List<String> urls = new ArrayList<String>();
            while (rs.next()) {
                urls.add(rs.getString("url"));
            }

            update = con.prepareStatement("update scrapyurl set seqid = ? where url = ?");
            for (int i = 0; i < urls.size(); i++) {
                update.setInt(1, i + 1);
                update.setString(2, urls.get(i));
                update.executeUpdate();
            }
        } catch (Exception e) {
            // Best-effort boundary: callers rely on this method never throwing.
            log.error("Failed to assign sequence numbers", e);
        } finally {
            closeQuietly(rs);
            closeQuietly(update);
            closeQuietly(st);
            closeQuietly(con);
        }
    }

    /** Loads the JDBC driver and opens a connection to the given URL. */
    private static Connection openConnection(String jdbcUrl)
            throws ClassNotFoundException, SQLException {
        Class.forName(JDBC_DRIVER);
        return DriverManager.getConnection(jdbcUrl, DB_USER, DB_PASSWORD);
    }

    /** Closes a result set, logging (not propagating) any failure; {@code null} is ignored. */
    private static void closeQuietly(ResultSet rs) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                log.error("Failed to close ResultSet", e);
            }
        }
    }

    /** Closes a statement, logging (not propagating) any failure; {@code null} is ignored. */
    private static void closeQuietly(Statement st) {
        if (st != null) {
            try {
                st.close();
            } catch (SQLException e) {
                log.error("Failed to close Statement", e);
            }
        }
    }

    /** Closes a connection, logging (not propagating) any failure; {@code null} is ignored. */
    private static void closeQuietly(Connection con) {
        if (con != null) {
            try {
                con.close();
            } catch (SQLException e) {
                log.error("Failed to close Connection", e);
            }
        }
    }

    /**
     * Entry point: starts the periodic scraping schedule.
     * {@link #parserScrapyUrlData()} (single pass) and {@link #setScrapyNum()}
     * (renumber pending URLs) can be invoked here instead for one-off runs.
     */
    public static void main(String args[]) {
        timerSchedule();
    }
}