版权声明:本文地址http://blog.csdn.net/caib1109/article/details/51518790
欢迎非商业目的的转载, 作者保留一切权利
网络有很多信息, 比如以”爬虫”为关键字搜索, 获得1,000,000条结果, 不可能靠人工去检测哪些信息是需要的.
所以爬虫的目的, 就是自动获得网页内容并保存有用信息.
具体的, 以下面通过Spring EL注入配置值的代码为例:
@Repository
public class RewardsTestDatabase {
@Value("#{jdbcProperties.databaseName}")
public void setDatabaseName(String dbName) { … }
@Value("#{jdbcProperties.databaseKeyGenerator}")
public void setKeyGenerator(KeyGenerator kg) { … }
}
其中, SpEL表达式按bean名称引用“jdbcProperties”, 该bean需要在applicationContext.xml中配置, 例如:
<!--src目录下的jdbcProperties.properties-->
<bean id="config" class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer">
<property name="fileEncoding" value="UTF-8"></property>
<property name="locations">
<list>
<value>classpath:jdbcProperties.properties</value>
</list>
</property>
</bean>
术语:
Spring Expression Language - “#{strategyBean.databaseKeyGenerator}”. Spring EL是Spring 3的新特性.
依赖包:
apache.httpclient4.5.2 - HttpGet, HttpPost
apache.httpcore4.4 - BasicNameValuePair implements NameValuePair
详细用法请参考wangpeng047@CSDN的大作, 内容全且准.
下面是我写的GET/POST请求
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.DefaultProxyRoutePlanner;
import org.apache.http.message.BasicHeader;
import org.apache.http.util.EntityUtils;
public class HttpRequestTool {
private static HttpHost proxy;
/** * set Proxy for httpclient * * @param proxyHost * 127.0.0.1 * @param port * 8080 * @return */
public static boolean setProxy(String proxyHost,String port){
if(proxyHost==null || port == null)
return false;
proxyHost=proxyHost.trim();
port=port.trim();
/* * 0-9 \\d 进行匹配 * 10-99 [1-9]\\d 进行匹配 * 100-199 1\\d\\d 进行匹配 * 200-249 2[0-4]\\d 进行匹配 * 250-255 25[0-5] 进行匹配 * (xxx|xxx|xxx|xxx)逻辑或, ^xxx$全字符串匹配 * ^(\\d|[1-9]\\d|1\\d\\d|2[0-4]\\d|25[0-5].){3}(\\d|[1-9]\\d|1\\d\\d|2[0-4]\\d|25[0-5])$ */
if(!Pattern.compile("^((\\d|[1-9]\\d|1\\d\\d|2[0-4]\\d|25[0-5]).){3}(\\d|[1-9]\\d|1\\d\\d|2[0-4]\\d|25[0-5])$").matcher(proxyHost).find())
return false;
if(Pattern.compile("[^\\d]").matcher(port).find())
return false;
int iPort = Integer.parseInt(port);
if(iPort>65535)
return false;
proxy = new HttpHost(proxyHost, iPort);
return true;
}
/** * simple getMethod without headers and parameters. * * @param host * @param resourcePath * @return * @throws URISyntaxException * @throws IOException */
public static String getMethod(String host, String resourcePath) throws URISyntaxException, IOException{
return getMethod("http", host, null, resourcePath, null, null);
}
/** * getMethod with headers and parameters. * * @param protocol * @param host * @param port * @param resourcePath * @param headKeyValueArray * @param paraKeyValueList * @return * @throws URISyntaxException * @throws IOException */
public static String getMethod(String protocol, String host, String port, String resourcePath, Header[] headKeyValueArray, List<NameValuePair> paraKeyValueList)
throws URISyntaxException, IOException {
URIBuilder builder = new URIBuilder().setScheme(protocol).setHost(host);
if(port!=null)
builder.setPort(Integer.parseInt(port));
if(resourcePath!=null)
builder.setPath("/" + resourcePath);
//Get请求参数
if(paraKeyValueList!=null)
builder.addParameters(paraKeyValueList); //中文参数自动转为utf-8
//不要用已经过时的httpGet.setParams(HetpParams params)方法
URI uri = builder.build();
HttpGet httpGet = new HttpGet(uri);
if (headKeyValueArray != null)
httpGet.setHeaders(headKeyValueArray);
CloseableHttpClient httpclient = (proxy==null)?
HttpClients.createDefault()
:
HttpClients.custom().setRoutePlanner(new DefaultProxyRoutePlanner(proxy)).build();
BufferedReader br = null;
InputStreamReader isr = null;
CloseableHttpResponse httpResponse = null;
try {
httpResponse = httpclient.execute(httpGet);
System.out.println(httpResponse.getStatusLine());
HttpEntity bodyEntity = httpResponse.getEntity();
isr = new InputStreamReader(bodyEntity.getContent());
br = new BufferedReader(isr);
StringBuffer httpBody = new StringBuffer();
String resTemp = "";
while ((resTemp = br.readLine()) != null) {
resTemp = resTemp.trim();
if (!"".equals(resTemp))
httpBody.append(resTemp.trim()).append("\n");
}
EntityUtils.consume(bodyEntity);
return httpBody.toString();
} finally {
try {
if (httpResponse != null)
httpResponse.close();
} catch (IOException e1) {
e1.printStackTrace();
}
if (isr != null) {
try {
isr.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (br != null) {
try {
br.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
/** * 版权声明:本文地址http://blog.csdn.net/caib1109/article/details/51518790 欢迎非商业目的的转载, 作者保留一切权利 */
/** * post Method with head and parameters * @param protocol * @param host * @param port * @param resourcePath * @param headKeyValueArray * @param paraKeyValueList * @return * @throws IOException * @throws URISyntaxException */
public static String postMethod(String protocol, String host, String port, String resourcePath, Header[] headKeyValueArray, List<NameValuePair> paraKeyValueList)
throws IOException, URISyntaxException{
CloseableHttpClient httpclient = (proxy==null)?
HttpClients.createDefault()
:
HttpClients.custom().setRoutePlanner(new DefaultProxyRoutePlanner(proxy)).build();
URIBuilder builder = new URIBuilder().setScheme(protocol).setHost(host);
if(port!=null){
builder.setPort(Integer.parseInt(port));
}
if(resourcePath!=null){
builder.setPath("/" + resourcePath);
}
URI uri = builder.build();
HttpPost httpPost = new HttpPost(uri);
if(headKeyValueArray!=null){
httpPost.setHeaders(headKeyValueArray);
}
httpPost.setEntity(new UrlEncodedFormEntity(paraKeyValueList));
BufferedReader br = null;
InputStreamReader isr = null;
CloseableHttpResponse httpResponse = null;
try {
httpResponse = httpclient.execute(httpPost);
System.out.println(httpResponse.getStatusLine());
HttpEntity entity1 = httpResponse.getEntity();
isr = new InputStreamReader(entity1.getContent());
br = new BufferedReader(isr);
StringBuffer httpBody = new StringBuffer();
String resTemp = "";
while ((resTemp = br.readLine()) != null) {
resTemp = resTemp.trim();
if (!"".equals(resTemp))
httpBody.append(resTemp.trim()).append("\n");
}
EntityUtils.consume(entity1);
return httpBody.toString();
} finally {
if(httpResponse!=null){
httpResponse.close();
}
}
}
}
jericho-html-3.4.jar包需要jdk7或以上
依赖于 log4j-api-2.4.1.jar, log4j-core-2.4.1.jar
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
public class CsdnGet {
protected Logger logger = LogManager.getLogger(this.getClass());
public void dealHtml(){
String str;
str = HttpRequestTool.getMethod("http", "write.blog.csdn.net", "80", "postlist", headerList, null);
//从html页面源码生成jericho树形结构Source
Source source = new Source(str);
//常用获得html标签的方法
//Element ele = source.getElementById("elementid");
//Element ele = source.getFirstElementByClass("elementclass");
//Element ele = source.getAllElementsByClass("elementclass");
//List<Element> eleList = source.getChildElements(); // 获得全部子标签,对分析<table>特别有用
//获得html标签文字内容的String
element.getTextExtractor().toString();
}
}
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.Source;
import org.apache.http.Header;
import org.apache.http.message.BasicHeader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import dto.Title_Num;
public class CsdnGet {
protected Logger logger = LogManager.getLogger(this.getClass());
private static final String articleListBox = "lstBox",
pageBox = "page_nav";
public void getHtml() {
String str = null;
try {
HttpRequestTool.setProxy("10.37.84.117", "8080");
Header[] headerList = {
new BasicHeader("Host", "write.blog.csdn.net"),
new BasicHeader("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0"),
new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
new BasicHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"),
new BasicHeader("Accept-Encoding", "gzip, deflate"),
new BasicHeader(
"Cookie",
"/*用抓包工具获得你的CSDN博客主页的cookie*/"),
new BasicHeader("Connection", "keep-alive") };
// list contains all title_num
List<Title_Num> itemlist = new LinkedList<Title_Num>();
//
str = HttpRequestTool.getMethod("http", "write.blog.csdn.net", "80", "postlist", headerList, null);
Source source = new Source(str);
getArticlesOnePage(source, itemlist);
// check total page 获得总页数的html标签
String pageInfo = source.getFirstElementByClass(pageBox).getFirstElement("span").getTextExtractor().toString();
// 正则表达式获得总页数
Matcher matcher = Pattern.compile("[^\\d](\\d{1,})[^\\d]").matcher(pageInfo);
String sTotalPage = null;
if(matcher.find())
sTotalPage = matcher.group(1);
int iTotalPage = Integer.parseInt(sTotalPage);
if(iTotalPage>1){
for(int i=2;i<=iTotalPage;i++){
String pageSuffix = String.format("postlist/0/0/enabled/%d", i);
str = HttpRequestTool.getMethod("http", "write.blog.csdn.net", "80", pageSuffix, headerList, null);
source = new Source(str);
getArticlesOnePage(source, itemlist);
}
}
// 输出
for(Title_Num title_Num:itemlist){
System.out.println(title_Num.getTitle()+title_Num.getNumber());
}
} catch (URISyntaxException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
private void getArticlesOnePage(Source source, List<Title_Num> itemlist){
// get 1st page
List<Element> articles = source.getElementById(articleListBox).getChildElements();
articles.remove(0);
for (Element article : articles) {
int col=0;
Title_Num title_Num = new Title_Num();
for (Element column : article.getChildElements()) {
if(col==0)
title_Num.setTitle(column.getTextExtractor().toString());
if(col==2)
title_Num.setNumber(Integer.parseInt(column.getTextExtractor().toString()));
col++;
}
itemlist.add(title_Num);
}
}
public static void main(String[] args) {
new CsdnGet().getHtml();
}
}
假设我们需要每天爬取自己的CSDN博客标题和阅读数, 和昨天的比较, 分析出每篇文章阅读量增加了多少.
那么, 我们需要每天手动启动爬虫进程吗?
No, Spring的Task组件可以完成定时启动的功能.
版权声明:本文地址http://blog.csdn.net/caib1109/article/details/51518790
欢迎非商业目的的转载, 作者保留一切权利