webmagic crawler self-study (4): crawling CSDN pages with the basic [list + detail] page combination

1. For the code to set up the webmagic project environment, please refer to

https://blog.csdn.net/qq_29914837/article/details/89309298

2. Crawling CSDN pages with the basic [list + detail] page combination

(Figure 1)

package demo.blog.csdn.net2.model;

import java.util.Date;
import java.util.List;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;

public class CsdnBlog {


	// Title
	private String article = "";

	// Publish date
	private String time;

	// Author
	private String nick_name = "";

	// Read count
	private int read_count;

	// Tags
	private List<String> labelList;
	private String label = "";

	// Categories
	private List<String> categoryList;
	private String category = "";

	// Article content
	private String content = "";

	// Link (extracted from the page URL)
	@ExtractByUrl
	private String url = "";
	  
	  public String getUrl() {
		return url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	// Collection time
	private Date collect_time;

	public Date getCollect_time() {
		return collect_time;
	}

	public void setCollect_time(Date collect_time) {
		this.collect_time = collect_time;
	}

	public String getArticle() {
		return article;
	}

	public void setArticle(String article) {
		this.article = article;
	}

	public String getTime() {
		return time;
	}

	public void setTime(String time) {
		this.time = time;
	}

	public String getNick_name() {
		return nick_name;
	}

	public void setNick_name(String nick_name) {
		this.nick_name = nick_name;
	}

	public int getRead_count() {
		return read_count;
	}

	public void setRead_count(int read_count) {
		this.read_count = read_count;
	}

	public List<String> getLabelList() {
		return labelList;
	}

	public void setLabelList(List<String> labelList) {
		this.labelList = labelList;
	}

	public List<String> getCategoryList() {
		return categoryList;
	}

	public void setCategoryList(List<String> categoryList) {
		this.categoryList = categoryList;
	}

	public String getContent() {
		return content;
	}

	public void setContent(String content) {
		this.content = content;
	}

	public String getLabel() {
		return label;
	}

	public void setLabel(String label) {
		this.label = label;
	}

	public String getCategory() {
		return category;
	}

	public void setCategory(String category) {
		this.category = category;
	}
	  
}
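Note that the pipeline in the next block reads the extracted values straight out of ResultItems and never instantiates this model class. If you would rather pass a typed object around, a small mapper can bridge the two. Below is a minimal sketch under that assumption; CsdnBlogMapper is a hypothetical helper and not part of the original project, and its keys simply mirror the putField() calls made by the page processor further down.

package demo.blog.csdn.net2.model;

import java.util.Date;
import us.codecraft.webmagic.ResultItems;

// Hypothetical helper (not in the original post): copies the fields extracted by the
// page processor into a CsdnBlog bean, using the same keys as its putField() calls.
public class CsdnBlogMapper {

	public static CsdnBlog fromResultItems(ResultItems resultItems) {
		CsdnBlog blog = new CsdnBlog();
		blog.setArticle(resultItems.get("article").toString());
		blog.setTime(resultItems.get("time").toString());
		blog.setNick_name(resultItems.get("nick_name").toString());
		blog.setRead_count(Integer.parseInt(resultItems.get("read_count").toString()));
		blog.setLabel(resultItems.get("label").toString());
		blog.setCategory(resultItems.get("category").toString());
		blog.setContent(resultItems.get("content").toString());
		blog.setUrl(resultItems.get("url").toString());
		blog.setCollect_time(new Date());
		return blog;
	}
}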

package demo.blog.csdn.net2.pipeline;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import util.JdbcUtil;

import java.sql.SQLException;
import java.sql.PreparedStatement;
import java.sql.Connection;
import java.sql.Date;

public class CsdnBlogPipeline implements Pipeline {

	@Override
	public void process(ResultItems resultItems, Task task) {
		
		if (resultItems.get("article") != null) {

			String article = resultItems.get("article").toString();
			String time = resultItems.get("time").toString();
			String nick_name = resultItems.get("nick_name").toString();
			int read_count = Integer.parseInt(resultItems.get("read_count").toString());
			String label = resultItems.get("label").toString();
			String category = resultItems.get("category").toString();
			String content = resultItems.get("content").toString();
			String url = resultItems.get("url").toString();

			System.out.println(article);

			Connection conn = JdbcUtil.getConnection();
			String sql = "INSERT INTO csdn ( article , time , nick_name , read_count , label , category , content , url ,collect_time) VALUES (?,?,?,?,?,?,?,?,?)";
			// try-with-resources makes sure the PreparedStatement is closed after the insert
			try (PreparedStatement ptmt = conn.prepareStatement(sql)) {
				ptmt.setString(1, article);
				ptmt.setString(2, time);
				ptmt.setString(3, nick_name);
				ptmt.setInt(4, read_count);
				ptmt.setString(5, label);
				ptmt.setString(6, category);
				ptmt.setString(7, content);
				ptmt.setString(8, url);
				ptmt.setDate(9, new Date(System.currentTimeMillis()));

				ptmt.execute();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}

	}

}
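The util.JdbcUtil class used above is not shown in this series. Here is a minimal sketch of what it might look like, assuming a local MySQL database; the JDBC URL, user name, and password are placeholders you need to replace with your own settings. Because the pipeline never closes the connection, the sketch hands out a single shared connection.

package util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

// Hypothetical sketch of the JdbcUtil referenced by CsdnBlogPipeline; not the original code.
// Replace the URL, user name and password with your own MySQL settings.
public class JdbcUtil {

	private static final String URL = "jdbc:mysql://localhost:3306/csdn_crawler?useUnicode=true&characterEncoding=utf8";
	private static final String USER = "root";
	private static final String PASSWORD = "root";

	private static Connection connection;

	public static synchronized Connection getConnection() {
		try {
			if (connection == null || connection.isClosed()) {
				// MySQL Connector/J 5.x driver class; use com.mysql.cj.jdbc.Driver for 8.x.
				Class.forName("com.mysql.jdbc.Driver");
				connection = DriverManager.getConnection(URL, USER, PASSWORD);
			}
			return connection;
		} catch (ClassNotFoundException | SQLException e) {
			throw new RuntimeException("Failed to open a database connection", e);
		}
	}
}

The INSERT statement also assumes a csdn table with the columns article, time, nick_name, read_count, label, category, content, url, and collect_time.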

Parsing the MVC-style [list + detail] basic page combination

webmagic does not automatically discover the next-page URL for paginated lists. The developer has to build the next-page URL and add it to the page manually, and before doing so should check whether the current list page still contains any article URLs.

package demo.blog.csdn.net2;

import java.util.List;
import org.apache.log4j.Logger;
import demo.blog.csdn.net2.pipeline.CsdnBlogPipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Crawled site: https://blog.csdn.net/qq_29914837/article/list/0?
 * Parses the MVC-style [list + detail] basic page combination.
 * @author yl
 */
public class CsdnBlogCrawler implements PageProcessor {
	
	private Logger logger = Logger.getLogger(CsdnBlogCrawler.class);

	public static final String csdn_name = "qq_29914837";

	public static final String URL_LIST = "https://blog\\.csdn\\.net/" + csdn_name + "/article/list/[0-9]*?";

	public static final String URL_POST = "https://blog\\.csdn\\.net/" + csdn_name + "/article/details/[0-9]*";
	
	private Site site = Site.me().setDomain("blog.csdn.net").setSleepTime(3000).setUserAgent(
			"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");

	@Override
	public void process(Page page) {

		try {
			// List page
			if (page.getUrl().regex(URL_LIST).match()) {

				String url = page.getUrl().toString();
				int i = Integer.parseInt(url.substring(url.lastIndexOf("/") + 1, url.lastIndexOf("?")));
                
				// webmagic does not provide automatic next-page URL discovery for paginated lists; before manually adding the next-page URL to the page, check that the current list page still contains article links.
				if (page.getHtml().xpath("//div[@id=\"mainBox\"]/main/div[2]").links().regex(URL_POST).match()) {

					page.addTargetRequests(page.getHtml().xpath("//div[@id=\"mainBox\"]/main/div[2]").links().regex(URL_POST).all());

					page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
					i++;
					// While on a list page, build the next-page URL by hand and add it to the page.
					page.addTargetRequest("https://blog.csdn.net/" + csdn_name + "/article/list/" + i + "?");
				}

				// Detail (article) page
			} else {
				page.putField("article", page.getHtml().xpath("//h1[@class='title-article']/text()"));
				page.putField("time", page.getHtml().xpath("//span[@class='time']/text()"));
				page.putField("nick_name", page.getHtml().xpath("//a[@class='follow-nickName']/text()"));
				page.putField("read_count", page.getHtml().xpath("//span[@class='read-count']/regex('\\d+')"));
				// List-type data is fetched with all(); setValue() joins the collected values into one string.
				page.putField("label", setValue(page.getHtml()
						.xpath("//span[@class='tags-box artic-tag-box']//a[@class='tag-link']/text()").all()));
				page.putField("category", setValue(
						page.getHtml().xpath("//div[@class='tags-box space']//a[@class='tag-link']/text()").all()));
				page.putField("content", page.getHtml().xpath("//div[@id='content_views']/html()"));
				page.putField("url", page.getUrl());

				if (page.getResultItems().get("article") == null) {
					page.setSkip(true);
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
			logger.error("Error while parsing the CSDN page!");
		}
	}

	@Override
	public Site getSite() {
		return site;
	}

	// setValue() joins the collected list values into a single "|"-separated string.
	private static String setValue(List<String> list) {
		StringBuilder sb = new StringBuilder();
		if (list != null && !list.isEmpty()) {
			for (String string : list) {
				sb.append(string).append("|");
			}
			return sb.substring(0, sb.lastIndexOf("|"));
		} else {
			return "";
		}
	}

	public static void main(String[] args) {
		Spider.create(new CsdnBlogCrawler()).addUrl("https://blog.csdn.net/" + csdn_name + "/article/list/1?")
				.addPipeline(new CsdnBlogPipeline()).thread(5).run();
	}

}

Run the main method (Run As). The console output below indicates the crawler succeeded; you can then check the database to see whether the crawled article data has been saved.

(Figure 2)
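If you would also like every extracted field printed to the console for debugging, webmagic's built-in ConsolePipeline can be chained in front of the custom database pipeline. A minimal sketch of an alternative main method for CsdnBlogCrawler, under that assumption:

	public static void main(String[] args) {
		Spider.create(new CsdnBlogCrawler())
				.addUrl("https://blog.csdn.net/" + csdn_name + "/article/list/1?")
				// Print every ResultItems to the console for debugging.
				.addPipeline(new us.codecraft.webmagic.pipeline.ConsolePipeline())
				// Then persist to MySQL as before.
				.addPipeline(new CsdnBlogPipeline())
				.thread(5)
				.run();
	}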


If you found this article helpful, please click the Follow button to the right of my avatar. Thank you!

Technology improves through exchange; knowledge spreads through sharing.
