// 网页抓取文字 (scraping text from web pages)

package com.smilezl.scrapy;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.log4j.Logger;


public class ScrapyGet {

private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();


private static final Logger log = Logger.getLogger(ScrapyGet.class);

private static int connectionTimeOut = 15000;

private static int socketTimeOut = 15000;

private static int readTimeOut = 20000;

private static int maxConnectionPerHost = 4;

private static int maxTotalConnections = 40;

private static boolean initialed = false;

private static int minContentLength = 4;

private static int maxContentLength = 400;

//定时任务

public static boolean isRunning = false;

private static long TIMEDELAY = 10 * 1000;

public static void SetPara() {

connectionManager.getParams().setConnectionTimeout(connectionTimeOut);

       connectionManager.getParams().setSoTimeout(socketTimeOut);

       connectionManager.getParams().setDefaultMaxConnectionsPerHost(

               maxConnectionPerHost);

       connectionManager.getParams().setMaxTotalConnections(maxTotalConnections);

//        connectionManager.get

       initialed = true;

}

public static String getResponseByGetMethod(String htmlUrl){

String charSet = "gbk";

try {

System.out.println("抓取链接: " + htmlUrl);

URL url = new URL(htmlUrl);

HttpURLConnection connection = (HttpURLConnection) url.openConnection();

connection.setDoOutput(true);

String contenttype = connection.getContentType();

charSet = getCharset(contenttype);

//System.out.println("charSet=" + charSet);

if (charSet == null)

charSet = "gbk";

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

return charSet;

}

//        return getResponseByGetMethod(url,"UTF-8");

       return getResponseByGetMethod(htmlUrl,charSet);

   }

/**

* 获取网页编码方式

* @param str

* @return

*/

public static String getCharset(String str) {

if (str == "" || str == null)

return null;

Pattern pattern = Pattern.compile("charset=.*");

Matcher matcher = pattern.matcher(str);

if (matcher.find()) {

return matcher.group(0).split("charset=")[1];

}

return null;

}

public static String getResponseByGetMethod(String url,String charSet) {

       HttpClient client = new HttpClient(connectionManager);

       if (!initialed) {

           SetPara();

       }

       GetMethod getMethod = new GetMethod(url);

       getMethod.getParams().setSoTimeout(readTimeOut);

//        System.out.println("timeout = " + client.getHttpConnectionManager().getParams().getConnectionTimeout());

       StringBuffer contentBuffer = new StringBuffer();

       try {

           int statusCode = client.executeMethod(getMethod);

           if (statusCode == HttpStatus.SC_OK) {

               InputStream in = getMethod.getResponseBodyAsStream();

               BufferedReader reader = new BufferedReader(new InputStreamReader(in,getMethod.getResponseCharSet()));

               String inputLine = null;

               while((inputLine = reader.readLine()) != null){

                   contentBuffer.append(inputLine);

                   contentBuffer.append("/n");

               }

               in.close();

               return convertStringCode(contentBuffer.toString(),getMethod.getResponseCharSet(),charSet);

           }else{

               log.error(getMethod.getStatusLine());

           }


       } catch (IOException e) {

//            System.out.println("e.getMessage() = " + e.getMessage());

           log.error( e.getMessage());

           return "";

       }finally {

           getMethod.releaseConnection();

       }

       return "";

   }

private static String convertStringCode(String source, String srcEncode, String destEncode) {

 if (source != null && !"".equals(source)) {

  try {

   return new String(source.getBytes(srcEncode), destEncode);

  } catch (UnsupportedEncodingException e) {

   e.printStackTrace();

   return "";

  }

 } else {

  return "";

 }

}

/**

* @param content

*/

public static List<String> parserRegex(String content) {

String regex="([\u4e00-\u9fa5]+)";

Matcher matcher = Pattern.compile(regex).matcher(content);

List<String> list = new ArrayList<String>();

while(matcher.find() ){

//System.out.println(matcher.group(0));

String tmp =matcher.group(0);

if (tmp.length() >= minContentLength && tmp.length() <= maxContentLength)

list.add(matcher.group(0));

}

return list;

}

/**

* @param list

*/

public static void saveScrapyContent(List<String> list, UrlModel model) {

try {

Class.forName("org.postgresql.Driver").newInstance();

String url = "jdbc:postgresql://localhost:5432/mydb?useUnicode=true&amp;";

Connection con = DriverManager.getConnection(url, "postgres", "password");

Statement st = con.createStatement();


int seqid = 1;

String maxSql = "select max(seqid) as m from scrapycontent";

System.out.println("记录总数" + list.size());

ResultSet rs = (ResultSet) st.executeQuery(maxSql);

if (rs.next() && rs.getInt(1) != 0) {

seqid = rs.getInt(1);

}

//更新已经抓取的数据URL

String setUrlType = "update scrapyurl set type = 1 where url='" + model.getUrl() + "'";

st.executeUpdate(setUrlType);

for (int i = 0; i < list.size(); i++) {

String sql = "insert into scrapycontent(content,seqid,parentseqid) values('" + list.get(i) + "'," + (seqid+i+1) +","+model.getSeqid()+")";

st.execute(sql);

}

rs.close();

st.close();

con.close();

} catch (Exception e) {

e.printStackTrace();

}

}

/**

* 获取所有网址

* @return

*/

public static List<UrlModel> queryScrapyUrl() {

List<UrlModel> list = new ArrayList<UrlModel>();

try {

Class.forName("org.postgresql.Driver").newInstance();

String url = "jdbc:postgresql://localhost:5432/mydb?useUnicode=true&amp;characterEncoding=gbk";

Connection con = DriverManager.getConnection(url, "postgres", "password");

Statement st = con.createStatement();

String sql = "select url,type,seqid from scrapyurl where type=0 order by url";

ResultSet rs = (ResultSet) st.executeQuery(sql);

UrlModel model;

while (rs.next() ) {

model = new UrlModel();

model.setUrl(rs.getString("url"));

model.setType(rs.getInt("type"));

model.setSeqid(rs.getInt("seqid"));

list.add(model);

//String setUrlType = "update scrapyurl set type = 1 where url='" + rs.getString("url") + "'";

//st.executeUpdate(setUrlType);

}

rs.close();

st.close();

con.close();

} catch (Exception e) {

e.printStackTrace();

return list;

}

return list;

}

public static void parserData(UrlModel model) {

String content = ScrapyGet.getResponseByGetMethod(model.getUrl());

//System.out.println(content);

List<String> list = parserRegex(content);

saveScrapyContent(list, model);

}

public static void parserScrapyUrlData() {

List<UrlModel> list = queryScrapyUrl();

for (int i = 0; i < list.size(); i++) {

parserData(list.get(i));

}

isRunning = false;

}

public static class MyTask extends TimerTask {


@Override

public void run() {

isRunning = true;

//处理所有数据

System.out.println("running");

parserScrapyUrlData();

}


}

private static Timer timer = new Timer();

public static void timerSchedule() {

System.out.println("定时开始");

if (!isRunning) {

System.out.println("执行任务");

MyTask task = new ScrapyGet.MyTask();

timer.schedule(task, TIMEDELAY, 10 * 1000);

}

}

/**

* 设置序号

*/

public static void setScrapyNum() {

try {

Class.forName("org.postgresql.Driver").newInstance();

String url = "jdbc:postgresql://localhost:5432/mydb?useUnicode=true&amp;characterEncoding=gbk";

Connection con = DriverManager.getConnection(url, "postgres", "password");

Statement st = con.createStatement();

String sql = "select url from scrapyurl where type=0 order by url";

ResultSet rs = (ResultSet) st.executeQuery(sql);

int i=1;

while (rs.next() ) {

String setUrlSeqid = "update scrapyurl set seqid=" + i + " where url='" + rs.getString("url") + "'";

st.executeUpdate(setUrlSeqid);

i++;

}

} catch (Exception e) {

e.printStackTrace();

}

}

public static void main(String args[]) {

//String url = "http://www.lizhi123.net/gaoxiaoyulu/90632.html";

//parserData(url);

//parserScrapyUrlData();

timerSchedule();

//setScrapyNum();

   }

}


// 你可能感兴趣的:(import,网页,package) -- blog footer ("you may also be interested in"), not part of the program