网络爬虫(Web Crawler),又称为网络蜘蛛(Web Spider)或 Web 信息采集器,是一种按照一定规则,自动抓取或下载网络信息的计算机程序或自动化脚本,是目前搜索引擎的重要组成部分。
狭义上理解:利用标准的 HTTP 协议,根据网络超链接(如 https://www.baidu.com/)和 Web 文档检索的方法(如深度优先)遍历万维网信息空间的软件程序。
功能上理解:确定待爬的 URL 队列,获取每个 URL 对应的网页内容(如 HTML/JSON),解析网页内容,并存储对应的数据。
网络爬虫按照系统架构和实现技术,大致可以分为以下几种类型:通用网络爬虫(General Purpose Web Crawler)、聚焦网络爬虫(Focused Web Crawler)、增量式网络爬虫(Incremental Web Crawler)、深层网络爬虫(Deep Web Crawler)。实际的网络爬虫系统通常是几种爬虫技术相结合实现的。
通用网络爬虫的爬取范围和数量巨大,对于爬行速度和存储空间要求较高,对于爬行页面的顺序要求较低,通常采用并行工作方式,有较强的应用价值。
和通用爬虫相比,聚焦爬虫只需要爬行与主题相关的页面,极大地节省了硬件和网络资源,保存的页面也由于数量少而更新快,可以很好地满足一些特定人群对特定领域信息的需求。
通常在设计聚焦网络爬虫时,需要加入链接和内容筛选模块。一个常见的案例是基于关键字获取符合用户需求的数据。
增量式网络爬虫避免了重复采集数据,可以减小时间和空间上的耗费。通常在设计增量式网络爬虫时,需要在数据库中加入时间戳,并基于时间戳的先后判断程序是否继续执行。
常见的案例有:论坛帖子评论数据的采集;天气数据的采集;新闻数据的采集;股票数据的采集等。
项目所用到的 jar 包(Maven 依赖):
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.32</version>
</dependency>
<dependency>
<groupId>commons-dbutils</groupId>
<artifactId>commons-dbutils</artifactId>
<version>1.7</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-dbcp2</artifactId>
<version>2.5.0</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>26.0-jre</version>
</dependency>
package com.sun.backstage.entity;
/**
 * Mutable bean holding one crawled product record.
 *
 * <p>Every property is kept as plain text; no validation or conversion is performed by the
 * accessors.
 */
public class JdModel {

    /** Product identifier. */
    private String itemId;
    /** Product title. */
    private String itemName;
    /** Product price, as text. */
    private String itemPrice;
    /** URL stored as the product image link. */
    private String itemImgUrl;
    /** URL of the product detail page. */
    private String itemUrl;
    /** Name of the shop selling the product. */
    private String shopName;
    /** Comment-count text. */
    private String commentNumber;
    /** URL of the shop page. */
    private String shopUrl;
    /** Time the record was crawled, as text. */
    private String crawlerTime;
    /** Category label of the record. */
    private String type;

    /** @return the product identifier */
    public String getItemId() {
        return this.itemId;
    }

    /** @param itemId the product identifier */
    public void setItemId(String itemId) {
        this.itemId = itemId;
    }

    /** @return the product title */
    public String getItemName() {
        return this.itemName;
    }

    /** @param itemName the product title */
    public void setItemName(String itemName) {
        this.itemName = itemName;
    }

    /** @return the product price text */
    public String getItemPrice() {
        return this.itemPrice;
    }

    /** @param itemPrice the product price text */
    public void setItemPrice(String itemPrice) {
        this.itemPrice = itemPrice;
    }

    /** @return the product image link */
    public String getItemImgUrl() {
        return this.itemImgUrl;
    }

    /** @param itemImgUrl the product image link */
    public void setItemImgUrl(String itemImgUrl) {
        this.itemImgUrl = itemImgUrl;
    }

    /** @return the product detail page URL */
    public String getItemUrl() {
        return this.itemUrl;
    }

    /** @param itemUrl the product detail page URL */
    public void setItemUrl(String itemUrl) {
        this.itemUrl = itemUrl;
    }

    /** @return the shop name */
    public String getShopName() {
        return this.shopName;
    }

    /** @param shopName the shop name */
    public void setShopName(String shopName) {
        this.shopName = shopName;
    }

    /** @return the comment-count text */
    public String getCommentNumber() {
        return this.commentNumber;
    }

    /** @param commentNumber the comment-count text */
    public void setCommentNumber(String commentNumber) {
        this.commentNumber = commentNumber;
    }

    /** @return the shop page URL */
    public String getShopUrl() {
        return this.shopUrl;
    }

    /** @param shopUrl the shop page URL */
    public void setShopUrl(String shopUrl) {
        this.shopUrl = shopUrl;
    }

    /** @return the crawl time text */
    public String getCrawlerTime() {
        return this.crawlerTime;
    }

    /** @param crawlerTime the crawl time text */
    public void setCrawlerTime(String crawlerTime) {
        this.crawlerTime = crawlerTime;
    }

    /** @return the category label */
    public String getType() {
        return this.type;
    }

    /** @param type the category label */
    public void setType(String type) {
        this.type = type;
    }
}
package com.sun.backstage.config.util;
import com.google.common.collect.Lists;
import org.apache.http.*;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
/**
 * HTTP helpers built on Apache HttpClient: generic GET fetching plus the JD search request.
 */
public class HttpRequest {

    /** Client created by the most recent {@link #getEntityByHttpGetMethod(String)} call. */
    private HttpClient httpClient;

    /**
     * Sends a GET request with browser-like headers and returns the response entity.
     *
     * <p>The entity's content stream is still open when this method returns, so the caller is
     * responsible for consuming it.
     *
     * @param url address to request
     * @return the response entity; {@code null} if the request failed (the exception is printed)
     *         or the response carries no entity
     */
    public HttpEntity getEntityByHttpGetMethod(String url) {
        httpClient = HttpClients.custom().build(); // a new client is built for every call
        HttpGet httpget = new HttpGet(url);
        // Request headers.
        httpget.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        httpget.setHeader("Accept-Encoding", "gzip, deflate");
        httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
        httpget.setHeader("Cache-Control", "max-age=0");
        httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"); // this header matters
        try {
            HttpResponse response = httpClient.execute(httpget);
            return response.getEntity();
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so one handler covers both.
            // Returning null here avoids dereferencing a response that was never assigned.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Fetches {@code url} and returns the response body as text.
     *
     * @param url  address to request
     * @param code charset name handed to {@code EntityUtils.toString} as the default charset
     * @return the body text, or {@code null} if the request failed, produced no entity, or the
     *         body could not be read
     */
    public String getHTMLContentByHttpGetMethod(String url, String code) {
        HttpEntity entity = getEntityByHttpGetMethod(url);
        if (entity == null) {
            return null;
        }
        try {
            return EntityUtils.toString(entity, code);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Requests one page of JD search results and returns the raw response body.
     *
     * <p>The request is executed exactly once, and both the client and the response are closed
     * before returning.
     *
     * @param keyword    search keyword, sent as both {@code keyword} and {@code wq}
     * @param pagenumber page number; sent to the server as {@code pagenumber * 2 - 1}
     * @param firstprice lower bound of the price filter
     * @param endprive   upper bound of the price filter
     * @return the response body, or an empty string when the response has no entity
     * @throws URISyntaxException      if the request URI cannot be built
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             if the request or reading the body fails
     */
    public static String getRawHtml(String keyword, int pagenumber, int firstprice, int endprive) throws URISyntaxException, ClientProtocolException, IOException {
        int page = pagenumber * 2 - 1;
        String url = "https://search.jd.com/s_new.php";

        // Query parameters.
        List<NameValuePair> nameAndValueList = new ArrayList<NameValuePair>();
        nameAndValueList.add(new BasicNameValuePair("keyword", keyword));
        nameAndValueList.add(new BasicNameValuePair("enc", "utf-8"));
        nameAndValueList.add(new BasicNameValuePair("qrst", "1"));
        nameAndValueList.add(new BasicNameValuePair("rt", "1"));
        nameAndValueList.add(new BasicNameValuePair("stop", "1"));
        nameAndValueList.add(new BasicNameValuePair("vt", "2"));
        nameAndValueList.add(new BasicNameValuePair("wq", keyword));
        nameAndValueList.add(new BasicNameValuePair("ev", "exprice_" + firstprice + "-" + endprive + "^"));
        nameAndValueList.add(new BasicNameValuePair("uc", "0"));
        nameAndValueList.add(new BasicNameValuePair("page", page + ""));
        URI uri = new URIBuilder(url).addParameters(nameAndValueList).build();

        // Default headers applied to every request made by the client.
        List<Header> headerList = Lists.newArrayList();
        headerList.add(new BasicHeader(HttpHeaders.ACCEPT, "text/html,application/xhtml+xml,application/xml;q=0.9," +
                "image/webp,image/apng,*/*;q=0.8"));
        headerList.add(new BasicHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"));
        headerList.add(new BasicHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate"));
        headerList.add(new BasicHeader(HttpHeaders.CACHE_CONTROL, "max-age=0"));
        headerList.add(new BasicHeader(HttpHeaders.CONNECTION, "keep-alive"));
        headerList.add(new BasicHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4,ja;q=0.2," +
                "de;q=0.2"));
        headerList.add(new BasicHeader(HttpHeaders.HOST, "search.jd.com"));
        headerList.add(new BasicHeader(HttpHeaders.REFERER, "https://search.jd.com/search"));

        HttpClientContext httpClientContext = HttpClientContext.create();
        HttpUriRequest httpUriRequest = RequestBuilder.get().setUri(uri).build();

        try (CloseableHttpClient client = HttpClients.custom().setDefaultHeaders(headerList).build();
             CloseableHttpResponse httpResponse = client.execute(httpUriRequest, httpClientContext)) {
            HttpEntity entity = httpResponse.getEntity();
            if (entity == null) {
                return "";
            }
            // "UTF-8" is only the fallback used when the response declares no charset.
            return EntityUtils.toString(entity, "UTF-8");
        }
    }
}
package com.sun.backstage.config.parse;
import com.sun.backstage.config.util.TimeUtils;
import com.sun.backstage.entity.JdModel;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Extracts product records from JD search-result HTML using jsoup.
 */
public class JdParse {

    /**
     * Parses the HTML and builds one {@link JdModel} per {@code li[class=gl-item]} element.
     *
     * @param html    search-result HTML; {@code null} or empty yields an empty list
     * @param keyword search keyword, stored as the {@code type} of every record
     * @return the parsed records, never {@code null}
     */
    public static List<JdModel> getData(String html, String keyword) {
        List<JdModel> data = new ArrayList<JdModel>();
        // A failed download reaches this method as null; treat it as "no results".
        if (html == null || html.isEmpty()) {
            return data;
        }
        Document doc = Jsoup.parse(html);
        Elements elements = doc.select("li[class=gl-item]");
        for (Element ele : elements) {
            data.add(toModel(ele, keyword));
        }
        return data;
    }

    /** Builds the record for a single product element. */
    private static JdModel toModel(Element ele, String keyword) {
        String itemId = ele.attr("data-sku");
        String itemPrice = ele.select("div[class=p-price]").select("strong").select("i").text();
        String itemName = ele.select("div[class~=p-name?]").select("em").text(); // book-type products
        String commentNumber = ele.select("div[class=p-commit]").text();
        // The real image URL could not be extracted, so the href of the anchor inside
        // p-img is stored in its place.
        String imgUrl = withScheme(ele.select("div[class=p-img]").select("a").attr("href"));
        String itemUrl = "https://item.jd.com/" + itemId + ".html";

        Elements shopBox = ele.select("div[class=p-shop]");
        Elements shopLink = shopBox.select("a[class=curr-shop]");
        String shopName;
        String shopUrl;
        if (shopLink.text().length() != 0) {
            shopName = shopLink.text();
            shopUrl = withScheme(shopLink.attr("href"));
        } else {
            // No shop link text: fall back to the fixed label and a URL built from data-shopid.
            shopName = "京东自营";
            shopUrl = "https://mall.jd.com/index-" + shopBox.attr("data-shopid") + ".html";
        }

        JdModel jdModel = new JdModel();
        jdModel.setItemId(itemId);
        jdModel.setItemName(itemName);
        jdModel.setItemPrice(itemPrice);
        jdModel.setCommentNumber(commentNumber);
        jdModel.setItemImgUrl(imgUrl);
        jdModel.setItemUrl(itemUrl);
        jdModel.setShopName(shopName);
        jdModel.setShopUrl(shopUrl);
        jdModel.setCrawlerTime(TimeUtils.GetNowDate("yyyy-MM-dd HH:mm:ss"));
        jdModel.setType(keyword);
        return jdModel;
    }

    /**
     * Prefixes {@code https:} to an href that carries no scheme; an empty href stays empty and
     * an href that already starts with {@code http:} or {@code https:} is returned unchanged.
     */
    private static String withScheme(String href) {
        if (href.isEmpty() || href.startsWith("https:") || href.startsWith("http:")) {
            return href;
        }
        return "https:" + href;
    }
}
package com.sun.backstage.config.util;
import java.text.DateFormat;
import java.text.DecimalFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
/**
 * Static date and time helpers built on {@link SimpleDateFormat} and {@link Calendar}.
 *
 * <p>Every method creates its own formatter, so no formatter instance is shared between calls.
 */
public class TimeUtils {

    /** Manual smoke test that prints a few sample conversions. */
    public static void main(String[] args) throws ParseException {
        List<String> monthlist = TimeUtils.YearMonth(2017, 2018);
        for (int i = 0; i < monthlist.size(); i++) {
            System.out.println(monthlist.get(i));
        }
        String time = getMonth("2002-1-08 14:50:38");
        System.out.println(time);
        System.out.println(getDay("2002-1-08 14:50:38"));
        System.out.println(TimeUtils.parseTime("2016-05-19 19:17", "yyyy-MM-dd HH:mm"));
        String data = getNowMonth();
        System.out.println(data);
    }

    /**
     * Formats the current time.
     *
     * @param formate a {@link SimpleDateFormat} pattern
     * @return the current time rendered with that pattern
     */
    public static String GetNowDate(String formate) {
        return new SimpleDateFormat(formate).format(new Date());
    }

    /**
     * Extracts the year and month of a time string.
     *
     * @param time text starting with a {@code yyyy-MM} date
     * @return the value as {@code yyyy-MM}, or {@code null} if the text cannot be parsed
     */
    public static String getMonth(String time) {
        return normalize(time, "yyyy-MM");
    }

    /**
     * Extracts the date part of a time string.
     *
     * @param time text starting with a {@code yyyy-MM-dd} date
     * @return the value as {@code yyyy-MM-dd}, or {@code null} if the text cannot be parsed
     */
    public static String getDay(String time) {
        return normalize(time, "yyyy-MM-dd");
    }

    /**
     * Parses the leading part of {@code time} with {@code pattern} and formats it back with the
     * same pattern; prints the stack trace and returns {@code null} on a parse failure.
     */
    private static String normalize(String time, String pattern) {
        SimpleDateFormat sdf = new SimpleDateFormat(pattern);
        try {
            return sdf.format(sdf.parse(time));
        } catch (ParseException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Parses text in {@code yyyy-MM-dd HH:mm:ss} form.
     *
     * @throws ParseException if the text does not match the pattern
     */
    public static Date parseTime(String inputTime) throws ParseException {
        return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(inputTime);
    }

    /**
     * Formats a date.
     *
     * @param date the date to format
     * @param type a {@link SimpleDateFormat} pattern
     */
    public static String dateToString(Date date, String type) {
        DateFormat df = new SimpleDateFormat(type);
        return df.format(date);
    }

    /**
     * Parses text with the given pattern.
     *
     * @throws ParseException if the text does not match the pattern
     */
    public static Date parseTime(String inputTime, String timeFormat) throws ParseException {
        return new SimpleDateFormat(timeFormat).parse(inputTime);
    }

    /**
     * Parses text with the given pattern into a {@link Calendar}.
     *
     * @throws ParseException if the text does not match the pattern
     */
    public static Calendar parseTimeToCal(String inputTime, String timeFormat) throws ParseException {
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(new SimpleDateFormat(timeFormat).parse(inputTime));
        return calendar;
    }

    /**
     * Returns the number of whole 24-hour periods from {@code cal1} to {@code cal2}; the
     * millisecond difference is divided with integer division, so a partial day is dropped.
     */
    public static int getDaysBetweenCals(Calendar cal1, Calendar cal2) throws ParseException {
        return (int) ((cal2.getTimeInMillis() - cal1.getTimeInMillis()) / (1000 * 24 * 3600));
    }

    /** Converts epoch milliseconds to a {@link Date}. */
    public static Date parseTime(long inputTime) {
        return new Date(inputTime);
    }

    /** Formats epoch milliseconds as {@code yyyy-MM-dd HH:mm:ss}. */
    public static String parseTimeString(long inputTime) {
        return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date(inputTime));
    }

    /**
     * Converts {@code yyyyMMddHHmmss} text to {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @return the converted text, or {@code null} if the input cannot be parsed
     */
    public static String parseStringTime(String inputTime) {
        try {
            Date parsed = new SimpleDateFormat("yyyyMMddHHmmss").parse(inputTime);
            return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(parsed);
        } catch (ParseException e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Lists the twelve months of {@code year} as {@code yyyyMM} strings. */
    public static List<String> YearMonth(int year) {
        return YearMonth(year, year);
    }

    /** Lists every month from {@code startyear} through {@code finistyear} as {@code yyyyMM} strings. */
    public static List<String> YearMonth(int startyear, int finistyear) {
        List<String> yearMonths = new ArrayList<String>();
        DecimalFormat twoDigits = new DecimalFormat("00");
        for (int year = startyear; year <= finistyear; year++) {
            for (int month = 1; month <= 12; month++) {
                yearMonths.add(year + "" + twoDigits.format(month));
            }
        }
        return yearMonths;
    }

    /**
     * Lists every day of {@code year} as {@code yyyy-MM-dd} strings, in calendar order and
     * including the first day of each month.
     */
    public static List<String> TOAllDay(int year) {
        List<String> days = new ArrayList<String>();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        Calendar cal = Calendar.getInstance();
        for (int month = 0; month < 12; month++) { // Calendar months are zero-based
            cal.clear();
            cal.set(Calendar.YEAR, year);
            cal.set(Calendar.MONTH, month);
            cal.set(Calendar.DAY_OF_MONTH, 1);
            int daysInMonth = cal.getActualMaximum(Calendar.DAY_OF_MONTH);
            for (int day = 1; day <= daysInMonth; day++) {
                cal.set(Calendar.DAY_OF_MONTH, day);
                days.add(sdf.format(cal.getTime()));
            }
        }
        return days;
    }

    /**
     * Returns yesterday's date.
     *
     * <p>NOTE(review): the pattern ends with a space, so the result does too; left unchanged
     * because callers may depend on it — confirm.
     */
    public static String getyesterday() {
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DATE, -1);
        return new SimpleDateFormat("yyyy-MM-dd ").format(cal.getTime());
    }

    /** Returns the current year and month as a {@code yyyyMM} string. */
    public static String getNowMonth() {
        Calendar cal = Calendar.getInstance();
        int year = cal.get(Calendar.YEAR);
        int month = cal.get(Calendar.MONTH) + 1; // Calendar months are zero-based
        return year + "" + new DecimalFormat("00").format(month);
    }
}
package com.sun.backstage.service;
/**
 * Service contract for crawling JD search results and storing them.
 */
public interface JdService {

    /**
     * Crawls search results for a keyword within a price range and stores them.
     *
     * @param word   search keyword
     * @param first  lower bound of the price range
     * @param end    upper bound of the price range
     * @param number number of result pages to crawl
     * @return an implementation-defined status value
     */
    int insert(String word, int first, int end, int number);
}
package com.sun.backstage.service.Impl;
import com.sun.backstage.config.parse.JdParse;
import com.sun.backstage.config.util.HttpRequest;
import com.sun.backstage.dao.JdMapper;
import com.sun.backstage.entity.JdModel;
import com.sun.backstage.service.JdService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.List;
/**
 * {@link JdService} implementation that downloads JD search pages, parses them and writes the
 * resulting records through {@link JdMapper}.
 */
@Service
public class JDServiceImpl implements JdService {

    @Autowired(required = false)
    private JdMapper jdMapper;

    /**
     * Crawls pages {@code 1..number} of the search results for {@code word} within the given
     * price range and inserts each page's records.
     *
     * <p>A page whose download fails, or which yields no records, is skipped and the crawl
     * continues with the next page.
     *
     * @param word   search keyword, also stored as the record type
     * @param first  lower bound of the price range
     * @param end    upper bound of the price range
     * @param number total number of pages to crawl
     * @return always {@code 0}
     */
    @Override
    public int insert(String word, int first, int end, int number) {
        for (int page = 1; page <= number; page++) {
            String html;
            try {
                html = HttpRequest.getRawHtml(word, page, first, end);
            } catch (URISyntaxException | IOException e) {
                e.printStackTrace();
                continue; // nothing was downloaded, so there is nothing to parse for this page
            }

            List<JdModel> items = JdParse.getData(html, word);
            if (items.isEmpty()) {
                // Skip the insert: an empty batch gives the multi-row INSERT no rows to write.
                continue;
            }

            // Print every crawled record.
            for (JdModel jd : items) {
                System.out.println("itemId:" + jd.getItemId() + "\t" + "itemName:" + jd.getItemName() + "\t" + "itemPrice:" + jd.getItemPrice() + "\tcommentnumber:" + jd.getCommentNumber() +"\timgurl:"+jd.getItemImgUrl()+ "\titemurl:" + jd.getItemUrl() + "\tshopname:" + jd.getShopName() + "\tshopurl:" + jd.getShopUrl() + "\tcrawl_time:" + jd.getCrawlerTime()+"\ttype:"+jd.getType());
            }
            jdMapper.insert(items);
        }
        return 0;
    }
}
package com.sun.backstage.controller.in;
import com.sun.backstage.service.JdService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
// A plain @Controller cannot be used here; this class has to be a @RestController.
/**
 * REST controller mapped under {@code "JD"} that triggers a crawl through {@link JdService}.
 */
@RestController
@RequestMapping("JD")
public class JD {

    @Autowired
    private JdService jdService;

    /**
     * Handles requests mapped to {@code "insert"} by forwarding all four request parameters to
     * {@link JdService#insert(String, int, int, int)}; the service's return value is discarded.
     *
     * @param word   request parameter passed through as the service's {@code word}
     * @param first  request parameter passed through as the service's {@code first}
     * @param end    request parameter passed through as the service's {@code end}
     * @param number request parameter passed through as the service's {@code number}
     */
    @RequestMapping("insert")
    public void insert(
            @RequestParam String word,
            @RequestParam int first,
            @RequestParam int end,
            @RequestParam int number) {
        jdService.insert(word, first, end, number);
    }
}
package com.sun.backstage.dao;
import com.sun.backstage.entity.JdModel;
import org.apache.ibatis.annotations.Param;
import java.util.List;
/**
 * MyBatis mapper for persisting {@link JdModel} records.
 */
public interface JdMapper {

    /**
     * Inserts the given records in one statement.
     *
     * @param jdModels records to insert; exposed to the mapped statement under the name
     *                 {@code jdModels}
     * @return the statement's int result (presumably the affected row count — confirm)
     */
    int insert(@Param("jdModels") List<JdModel> jdModels);
}
<mapper namespace="com.sun.backstage.dao.JdMapper">
<insert id="insert" parameterType="java.util.List">
INSERT INTO tb_jd_info(pk, item_id, item_name, item_price, item_img_url, item_url, shop_name, comment_number, shop_url, crawler_time, type) VALUES
<foreach collection="jdModels" item="item" index="index" separator=",">
(UUID(),#{item.itemId},#{item.itemName},#{item.itemPrice},#{item.itemImgUrl},#{item.itemUrl},#{item.shopName},#{item.commentNumber},#{item.shopUrl},#{item.crawlerTime},#{item.type})
</foreach>
</insert>
</mapper>