网络爬虫概述:
pom配置
<dependencies>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.7</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.6</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
    </dependency>
</dependencies>
log4j.properties配置
log4j.rootLogger=DEBUG,A1
log4j.logger.cn.itcast = DEBUG
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n
@Test
public void SCrawler()throws Exception{
    // Fetch http://www.itcast.cn/ with HttpClient and print the page source.
    CloseableHttpClient httpClient = HttpClients.createDefault();
    HttpGet httpGet = new HttpGet("http://www.itcast.cn/");
    CloseableHttpResponse response = httpClient.execute(httpGet);
    try {
        // Only read the body on a successful (200) response.
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
    } finally {
        // Close the response before the client. The original never closed the
        // response (leak) and ended with a stray, meaningless httpGet.clone().
        response.close();
        httpClient.close();
    }
}
结果为该网页的网页源代码
概述:网络爬虫就是用程序帮助我们访问网络上的资源,我们一直以来都是使用HTTP协议访问互联网的网页,网络爬虫需要编写程序,在这里使用同样的HTTP协议访问网页。这里我们使用Java的HTTP协议客户端 HttpClient这个技术,来实现抓取网页数据。
@Test
public void HttpGet(){
// Create the HttpClient instance
CloseableHttpClient httpClient = HttpClients.createDefault();
// Create the HttpGet request with the target URL
HttpGet httpGet = new HttpGet("http://www.itcast.cn/");
// Execute the request with the client and capture the response
CloseableHttpResponse response = null;
try {
response = httpClient.execute(httpGet);
// Parse the response: on HTTP 200, read the entity as UTF-8 and print it
if (response.getStatusLine().getStatusCode() == 200){
String html = EntityUtils.toString(response.getEntity(),"UTF-8");
System.out.println(html);
}
} catch (IOException e) {
e.printStackTrace();
}finally {// Release resources
// A null response means the request never completed; only close it if set
if (response!=null){
try {
response.close();
// NOTE(review): nulling a local does NOT make GC collect it sooner;
// kept only to preserve the original statement flow
response = null;
} catch (IOException e) {
e.printStackTrace();
}
}
try {
httpClient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
爬取的网址:http://yun.itheima.com/search?keys=Java
//Get request with query parameters
@Test
public void HttpParamGet() throws Exception {
    // Create the HttpClient instance
    CloseableHttpClient httpClient = HttpClients.createDefault();
    // Target address: http://yun.itheima.com/search?keys=Java
    // 1. Build the URI from the parameterless base address
    URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
    // 2. Attach the query parameter; chain setParameter("k","v") calls for more
    uriBuilder.setParameter("keys","Java");
    // Create the HttpGet request from the built URI
    HttpGet httpGet = new HttpGet(uriBuilder.build());
    // try-with-resources guarantees the response is closed even when
    // EntityUtils.toString throws (the original's manual close leaked then)
    try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
        // On HTTP 200, read the entity as UTF-8 and print it
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Always release the client itself
        try {
            httpClient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
@Test
public void HttpPost(){
    // Build a default client and a POST request for the target page.
    CloseableHttpClient client = HttpClients.createDefault();
    HttpPost post = new HttpPost("http://www.itcast.cn/");
    CloseableHttpResponse resp = null;
    try {
        // Fire the request; on HTTP 200 dump the page source as UTF-8.
        resp = client.execute(post);
        if (resp.getStatusLine().getStatusCode() == 200){
            String body = EntityUtils.toString(resp.getEntity(),"UTF-8");
            System.out.println(body);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Tear down the response first (if we ever got one), then the client.
        if (resp != null){
            try {
                resp.close();
                resp = null;
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        try {
            client.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
//Post request with form parameters
@Test
public void HttpParamPost() throws Exception {
// Create the HttpClient instance
CloseableHttpClient httpClient = HttpClients.createDefault();
// Create the HttpPost request with the target URL
HttpPost httpPost = new HttpPost("http://www.itcast.cn/search");
// Declare a list holding the form's request parameters
List<NameValuePair> params = new ArrayList<NameValuePair>();
params.add(new BasicNameValuePair("keys","Java"));
// Build the form entity: first argument is the parameter list, second the charset
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8");
// Attach the form entity to the POST request
httpPost.setEntity(formEntity);
// Execute the request with the client and capture the response
CloseableHttpResponse response = null;
try {
response = httpClient.execute(httpPost);
// Parse the response: on HTTP 200, read the entity as UTF-8 and print it
if (response.getStatusLine().getStatusCode() == 200){
String html = EntityUtils.toString(response.getEntity(),"UTF-8");
System.out.println(html);
}
} catch (IOException e) {
e.printStackTrace();
}finally {// Release resources
// A null response means the request never completed; only close it if set
if (response!=null){
try {
response.close();
// NOTE(review): nulling a local does NOT make GC collect it sooner;
// kept only to preserve the original statement flow
response = null;
} catch (IOException e) {
e.printStackTrace();
}
}
try {
httpClient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
概述:如果每次请求都要创建HttpClient,会有频繁创建和销毁的问题,可以使用连接池来解决这个问题。
public class httpClientPool {
    public static void main(String[] args) throws Exception {
        // Create the pooling connection manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // Cap the total number of pooled connections
        cm.setMaxTotal(100);
        // Cap the connections per route (i.e. per target host)
        cm.setDefaultMaxPerRoute(10);
        // Issue several requests through the shared pool
        doGet(cm);
        doGet(cm);
        doGet(cm);
    }
    // Perform one GET against http://www.itcast.cn/ using a pool-backed client.
    private static void doGet(PoolingHttpClientConnectionManager cm) throws Exception {
        // Obtain a client backed by the pool instead of creating a standalone one
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet("http://www.itcast.cn/");
        CloseableHttpResponse response = httpClient.execute(httpGet);
        try {
            if (response.getStatusLine().getStatusCode()==200){
                String html = EntityUtils.toString(response.getEntity());
                System.out.println(html);
            }
        } finally {
            // Close the response even if reading the entity throws
            // (the original leaked it on exception)
            response.close();
        }
        // Important: do NOT close the client — its connections belong to,
        // and are reused by, the pool manager
        // httpClient.close();
    }
}
概述:有时候因为网络,或者目标服务器的原因,请求需要更长的时间才能完成,我们需要自定义相关时间。
public class HttpClientConfig {
public static void main(String[] args) {
// Create the HttpClient instance
CloseableHttpClient httpClient = HttpClients.createDefault();
// Create the HttpGet request with the target URL
HttpGet httpGet = new HttpGet("http://www.itcast.cn/");
// Configure the request timeouts
RequestConfig config = RequestConfig.custom().setConnectTimeout(1000)// max time to establish the connection, in ms
.setConnectionRequestTimeout(500)// max time to obtain a connection from the manager, in ms
.setSocketTimeout(10*1000) // max time for data transfer (socket read), in ms
.build();
// Apply the configuration to this request
httpGet.setConfig(config);
// Execute the request with the client and capture the response
CloseableHttpResponse response = null;
try {
response = httpClient.execute(httpGet);
// Parse the response: on HTTP 200, read the entity as UTF-8 and print it
if (response.getStatusLine().getStatusCode() == 200){
String html = EntityUtils.toString(response.getEntity(),"UTF-8");
System.out.println(html);
}
} catch (IOException e) {
e.printStackTrace();
}finally {// Release resources
// A null response means the request never completed; only close it if set
if (response!=null){
try {
response.close();
// NOTE(review): nulling a local does NOT make GC collect it sooner;
// kept only to preserve the original statement flow
response = null;
} catch (IOException e) {
e.printStackTrace();
}
}
try {
httpClient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
概述:jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。
主要功能:
@Test
public void testUrl()throws Exception{
    // Have jsoup fetch and parse the URL directly; the second argument is
    // the connection timeout in milliseconds.
    Document page = Jsoup.parse(new URL("http://www.itcast.cn/"), 1000);
    // Grab the first <title> element via the tag selector and print its text.
    String pageTitle = page.getElementsByTag("title").first().text();
    System.out.println(pageTitle);
}
注意:虽然使用Jsoup可以替代HttpClient直接发起请求解析数据,但是往往不会这样用,因为实际的开发过程中,需要使用到多线程,连接池,代理等等方式,而jsoup对这些的支持并不是很好,所以我们一般把jsoup仅仅作为Html解析工具使用
//Parse an HTML string
@Test
public void testString()throws Exception{
    // Read the HTML file into a string with commons-io, then parse the string.
    String content = FileUtils.readFileToString(new File("E:\\code\\java\\java_crawler\\src\\main\\java\\jsoup\\jsoup.html"), "utf8");
    Document document = Jsoup.parse(content);
    // Print the text of the first <title> element.
    System.out.println(document.getElementsByTag("title").first().text());
}
//Parse an HTML file directly
@Test
public void testFile()throws Exception{
    // Let jsoup read and decode the file itself.
    Document document = Jsoup.parse(new File("E:\\code\\java\\java_crawler\\src\\main\\java\\jsoup\\jsoup.html"), "utf8");
    // Print the text of the first <title> element.
    String pageTitle = document.getElementsByTag("title").first().text();
    System.out.println(pageTitle);
}
元素获取方式:
//DOM-style element lookups
@Test
public void testDOM()throws Exception{
    // Parse the sample HTML file.
    Document doc = Jsoup.parse(new File("E:\\code\\java\\java_crawler\\src\\main\\java\\jsoup\\jsoup.html"), "utf8");
    // 1. Single element by id.
    Element byId = doc.getElementById("city_bj");
    // 2. First element with the given tag name.
    Element byTag = doc.getElementsByTag("span").first();
    // 3. First element carrying this class attribute value. An element with
    //    several classes can also be found by any single one of them, e.g.
    //    getElementsByClass("class_a") or getElementsByClass("class_b").
    Element byClass = doc.getElementsByClass("class_a class_b").first();
    // 4. First element that has the attribute at all...
    Element byAttr = doc.getElementsByAttribute("abc").first();
    //    ...or that has the attribute with a specific value.
    Element byAttrValue = doc.getElementsByAttributeValue("href", "http://sh.itcast.cn").first();
    // Print the text of each element found above, in the same order.
    System.out.println(byId.text());
    System.out.println(byTag.text());
    System.out.println(byClass.text());
    System.out.println(byAttr.text());
    System.out.println(byAttrValue.text());
}
元素中获取数据:
@Test
public void testData()throws Exception{
// Parse the file to obtain the document
Document doc = Jsoup.parse(new File("E:\\code\\java\\java_crawler\\src\\main\\java\\jsoup\\jsoup.html"), "utf8");
// Look up the element under test by id
Element element = doc.getElementById("test");
String str = "";
// Ways to extract data from an element
// 1. The element's id
str = element.id();
// 2. The element's className (full class attribute value)
str = element.className();
// Split on whitespace and collect the individual class names into a set
Set<String> strList = element.classNames();
// 3. An attribute's value via attr
str = element.attr("id");
str = element.attr("class");
// 4. All attributes at once via attributes
Attributes attributes = element.attributes();
System.out.println(attributes.toString());
// 5. The element's text content; note str holds only the LAST assignment
str = element.text();
System.out.println(str);
//System.out.println(strList);
}
tagname: 通过标签查找元素,比如:span
#id: 通过ID查找元素,比如:#city_bj
.class: 通过class名称查找元素,比如:.class_a
[attribute]: 利用属性查找元素,比如:[abc]
[attr=value]: 利用属性值来查找元素,比如:[class=s_name]
//CSS selector lookups
@Test
public void testSelector()throws Exception{
    // Parse the sample HTML file.
    Document doc = Jsoup.parse(new File("E:\\code\\java\\java_crawler\\src\\main\\java\\jsoup\\jsoup.html"), "utf8");
    // tagname — print every <span> element's text.
    for (Element span : doc.select("span")) {
        System.out.println(span.text());
    }
    // #id — single element by id.
    System.out.println(doc.select("#city_bj").first().text());
    // .class — first element with the class.
    System.out.println(doc.select(".class_a").first().text());
    // [attribute] — first element carrying the attribute.
    System.out.println(doc.select("[abc]").first().text());
    // [attr=value] — every element whose attribute has the given value.
    for (Element named : doc.select("[class=s_name]")) {
        System.out.println(named.text());
    }
}
el#id: 元素+ID,比如: h3#city_bj
el.class: 元素+class,比如: li.class_a
el[attr]: 元素+属性名,比如: span[abc]
任意组合: 比如:span[abc].s_name
ancestor child: 查找某个元素下子元素,比如:.city_con li 查找"city_con"下的所有li
parent > child: 查找某个父元素下的直接子元素,比如:
.city_con > ul > li 查找city_con第一级(直接子元素)的ul,再找所有ul下的第一级li
parent > *: 查找某个父元素下所有直接子元素
//Combining selectors
@Test
public void testSelectors()throws Exception{
    // Parse the sample HTML file.
    Document doc = Jsoup.parse(new File("E:\\code\\java\\java_crawler\\src\\main\\java\\jsoup\\jsoup.html"), "utf8");
    // el#id — element type plus id, e.g. h3#city_bj
    String text = doc.select("h3#city_bj").text();
    // el.class — element type plus class, e.g. li.class_a
    text = doc.select("li.class_a").text();
    // el[attr] — element type plus attribute name, e.g. span[abc]
    text = doc.select("span[abc]").text();
    // Any combination of the above, e.g. span[abc].s_name
    text = doc.select("span[abc].s_name").text();
    // ancestor child — every li anywhere under .city_con
    text = doc.select(".city_con li").text();
    // parent > child — only direct children along the chain
    text = doc.select(".city_con > ul > li").text();
    // parent > * — all direct children of the parent
    text = doc.select(".city_con > *").text();
    // As in the original, only the result of the LAST lookup is printed.
    System.out.println(text);
}