(一)前言
我的上一篇博客已经说明如何爬取某一个网页的动漫数据,这里重点说一下一个完整的爬虫实例。
和上一篇文章相比,多了的就是动画种类,日文名什么的。
推荐这个爬取博客的:http://blog.csdn.net/qq598535550/article/details/51287630
我也是根据这个学的。
用到的工具有:IntelliJ IDEA、MySQL、WebMagic 0.7.3 等
项目github地址:https://github.com/yongzhuo/JavaLearning/tree/master/src/java/Webmagic
(二)详细过程
爬取的内容有:标题,动画种类,日文名,别名,播放时间,播放状态,类型,
原作,监督,制作公司,官网,剧情简介(简短版),剧情简介(完整版),评分,评分人数,相关url等。
爬取图片如下:
推荐
(1)首先mySQL建表dmzjAnimation
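maven依赖:
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>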
-- Table holding one row per scraped anime detail page.
-- Column meanings mirror the fields of the DmzjAnimation Java entity.
CREATE TABLE `dmzjAnimation` (
`id` int(11) unsigned NOT NULL auto_increment, -- primary key; the Java DAO also supplies this value explicitly on insert
`hahawebname` varchar(127) default NULL, -- title
`antag` varchar(255) default NULL, -- animation kind
`japanname` varchar(255) default NULL, -- Japanese name
`allname` varchar(255) default NULL, -- alias(es)
`year` varchar(255) default NULL, -- broadcast time
`state` varchar(255) default NULL, -- broadcast state
`tag` varchar(643) default NULL, -- genre(s)
`original` varchar(255) default NULL, -- original work
`Screenwriter` varchar(255) default NULL, -- director (the Java field is named screenwriter)
`company` varchar(255) default NULL, -- production company
`website` varchar(511) default NULL, -- official website
`content` varchar(2559) default NULL, -- synopsis (short form)
`contentdetail` varchar(10240) default NULL, -- synopsis (full form)
`goal` varchar(255) default NULL, -- rating score
`mentotal` varchar(255) default NULL, -- number of people who rated
`url` varchar(255) default NULL, -- URL of the source page
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
用Navicat for MySQL查看是这样子的
(2)第二步就开始写java代码,主要包括DmzjAnimationProcessor(核心类),DmzjAnimationDao(java连接MySQL类)和DmzjAnimation(数据实体类)
(2.1)dmzjAnimation,实体类对应
package Webmagic.donghua.dmzj.com;
/***Created by mo
*On 2017/10/23 ***12:08.
******/
public class DmzjAnimation {
private int id;
private String hahawebname;// 标题
private String antag;//动画种类
private String japanname;//日文名
private String allname;//别名
private String year;//播放时间
private String state;//播放状态
private String tag;//类型
private String original;//原作
private String screenwriter;//监督
private String company;//制作公司
private String website;//官网
private String content;//剧情简介
private String contentdetail;//剧情简介
private String goal;//评分
private String mentotal;//评分人数
private String url;//相关url
@Override
public String toString() {
return "DmzjAnimation{" +
"id=" + id +
", hahawebname='" + hahawebname + '\'' +
", antag='" + antag + '\'' +
", japanname='" + japanname + '\'' +
", allname='" + allname + '\'' +
", year='" + year + '\'' +
", state='" + state + '\'' +
", tag='" + tag + '\'' +
", original='" + original + '\'' +
", screenwriter='" + screenwriter + '\'' +
", company='" + company + '\'' +
", website='" + website + '\'' +
", content='" + content + '\'' +
", contentdetail='" + contentdetail + '\'' +
", goal=" + goal +
", mentotal=" + mentotal +
", url='" + url + '\'' +
'}';
}
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getHahawebname() {
return hahawebname;
}
public void setHahawebname(String hahawebname) {
this.hahawebname = hahawebname;
}
public String getAntag() {
return antag;
}
public void setAntag(String antag) {
this.antag = antag;
}
public String getJapanname() {
return japanname;
}
public void setJapanname(String japanname) {
this.japanname = japanname;
}
public String getAllname() {
return allname;
}
public void setAllname(String allname) {
this.allname = allname;
}
public String getYear() {
return year;
}
public void setYear(String year) {
this.year = year;
}
public String getState() {
return state;
}
public void setState(String state) {
this.state = state;
}
public String getTag() {
return tag;
}
public void setTag(String tag) {
this.tag = tag;
}
public String getOriginal() {
return original;
}
public void setOriginal(String original) {
this.original = original;
}
public String getScreenwriter() {
return screenwriter;
}
public void setScreenwriter(String screenwriter) {
this.screenwriter = screenwriter;
}
public String getCompany() {
return company;
}
public void setCompany(String company) {
this.company = company;
}
public String getWebsite() {
return website;
}
public void setWebsite(String website) {
this.website = website;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getContentdetail() {
return contentdetail;
}
public void setContentdetail(String contentdetail) {
this.contentdetail = contentdetail;
}
public String getGoal() {
return goal;
}
public void setGoal(String goal) {
this.goal = goal;
}
public String getMentotal() {
return mentotal;
}
public void setMentotal(String mentotal) {
this.mentotal = mentotal;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
}
(2.2)DmzjAnimationProcessor,爬虫逻辑,核心类
package Webmagic.donghua.dmzj.com;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
/***Created by mo
*On 2017/10/23 ***12:09.
******/
/**
 * WebMagic {@link PageProcessor} that extracts the attributes of one anime
 * detail page into a {@link DmzjAnimation} and prints them to standard output.
 *
 * <p>Persistence is left disabled (see the end of {@link #process(Page)}).
 */
public class DmzjAnimationProcessor implements PageProcessor {

    /** Marker text the page shows for an attribute that has no value. */
    private static final String UNAVAILABLE = "暂无";

    /** Regex matching any opening or closing HTML tag, used to reduce an element to its text. */
    private static final String HTML_TAG_REGEX = "</?[^>]+>";

    /** Running count of processed pages; used as the entity id. */
    int myid = 0;

    // Crawl configuration: charset, delay between requests, retry count.
    private Site site = Site.me().setRetryTimes(1000).setSleepTime(1000).setCharset("utf8");

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts title, rating, synopsis and the attribute list from the page.
     *
     * <p>Any element that is missing from the page leaves the corresponding
     * entity field {@code null} instead of aborting the whole page.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        DmzjAnimation dmzjAnimation = new DmzjAnimation();
        Html html = page.getHtml();
        myid++;
        dmzjAnimation.setId(myid);

        // Title
        String hahawebname = html.xpath("//div[@class=\"odd_anim_title_tnew\"]/div[@class=\"tvversion\"]/a/span[@class=\"anim_title_text\"]/h1/text()").get();
        dmzjAnimation.setHahawebname(hahawebname);
        // Rating score
        String goal = html.xpath("//div[@class=\"anim_star\"]/ul/li[@id=\"anim_score_info\"]/span[@class=\"points_text\"]/text()").get();
        dmzjAnimation.setGoal(goal);
        // Number of raters; the xpath yields null when the element is absent, so guard before stripping the suffix.
        String mentotalold = html.xpath("//div[@class=\"anim_star\"]/ul/li[@id=\"score_statistics\"]/span[@id=\"score_count_span\"]/text()").get();
        String mentotal = mentotalold == null ? null : mentotalold.replaceAll("人评分","");
        dmzjAnimation.setMentotal(mentotal);
        // Short synopsis
        String content = html.xpath("//div[@class=\"odd_anim_title_mnew\"]/p/span[@id=\"gamedescshort\"]/text()").get();
        dmzjAnimation.setContent(content);
        // Full synopsis
        String contentdetail = html.xpath("//div[@class=\"odd_anim_title_mnew\"]/p/span[@id=\"gamedescall\"]/text()").get();
        dmzjAnimation.setContentdetail(contentdetail);

        System.out.println("hahawebname: "+ hahawebname);
        System.out.println("goal: "+goal);
        System.out.println("mentotal: "+ mentotal);
        System.out.println("content: "+ content);
        System.out.println("contentdetail: "+ contentdetail);

        // Each <li> of the attribute list is one "label : value" pair.
        List<Selectable> nodes = html.xpath("//div[@class=\"anim_attributenew_text\"]/ul/li").nodes();
        for (Selectable item : nodes) {
            String tmp = item.get();
            if (tmp == null) {
                continue;
            }
            if (tmp.contains("动画种类")) { // animation kind
                String antag = labelledValue(tmp, "动画种类 : ");
                System.out.println(antag);
                dmzjAnimation.setAntag(antag);
            }
            if (tmp.contains("日文名")) { // Japanese name
                String japanname = nullIfUnavailable(labelledValue(tmp, "日文名 : "));
                System.out.println(japanname);
                dmzjAnimation.setJapanname(japanname);
            }
            if (tmp.contains("别名")) { // alias(es)
                String allname = nullIfUnavailable(labelledValue(tmp, "别名 : "));
                System.out.println(allname);
                dmzjAnimation.setAllname(allname);
            }
            if (tmp.contains("首播")) { // first broadcast time
                String year = nullIfUnavailable(labelledValue(tmp, "首播时间 : "));
                System.out.println(year);
                dmzjAnimation.setYear(year);
            }
            if (tmp.contains("播放状态")) { // broadcast state
                String state = nullIfUnavailable(labelledValue(tmp, "播放状态 : "));
                System.out.println(state);
                dmzjAnimation.setState(state);
            }
            if (tmp.contains("剧情类型")) { // genres: individual entries are joined with " / "
                String tag = labelledValue(tmp, "剧情类型 : ").replaceAll(" "," / ");
                System.out.println(tag);
                dmzjAnimation.setTag(tag);
            }
            if (tmp.contains("原作")) { // original work
                String original = nullIfUnavailable(labelledValue(tmp, "原作 :"));
                System.out.println(original);
                dmzjAnimation.setOriginal(original);
            }
            if (tmp.contains("监督")) { // director
                String screenwriter = nullIfUnavailable(labelledValue(tmp, "监督 :"));
                dmzjAnimation.setScreenwriter(screenwriter);
                System.out.println(screenwriter);
            }
            if (tmp.contains("制作公司")) { // production company
                String companyName = labelledValue(tmp, "制作公司 :");
                // NOTE(review): the name is stored twice, the second copy suffixed with "公司";
                // kept as in the original - confirm whether this duplication is intended.
                String company = nullIfUnavailable(companyName +" / "+ companyName +"公司");
                System.out.println(company);
                dmzjAnimation.setCompany(company);
            }
            if (tmp.contains("官方网站")) { // official website
                // Drop everything up to and including "href=" and everything from "target" onwards.
                String website = nullIfUnavailable(tmp.replaceAll(".*?href=|target(.*)",""));
                System.out.println(website);
                dmzjAnimation.setWebsite(website);
            }
        }

        // Record the URL that was actually requested rather than rebuilding it from a
        // counter, which was off by one and drifted whenever a page failed to download.
        String url = page.getRequest().getUrl();
        dmzjAnimation.setUrl(url);
        // Persistence is disabled here. To store into MySQL call
        //   new DmzjAnimationDao().add(dmzjAnimation);
        // (preferably reusing one DAO instance), or write to another sink such as a text file.
    }

    /** Strips all HTML tags from an attribute element and removes its label prefix. */
    private static String labelledValue(String elementHtml, String label) {
        return elementHtml.replaceAll(HTML_TAG_REGEX, "").replace(label, "");
    }

    /** Maps a value containing the "no value" marker to {@code null}; other values pass through. */
    private static String nullIfUnavailable(String value) {
        return value.contains(UNAVAILABLE) ? null : value;
    }

    /**
     * Crawls the detail pages with ids 10 through 15000, one spider run per page,
     * printing the elapsed time of each run.
     */
    public static void main(String[] args) {
        DmzjAnimationProcessor my = new DmzjAnimationProcessor();
        long startTime, endTime;
        System.out.println("开始爬取...");
        for (int username = 10; username <= 15000; username++) {
            startTime = System.currentTimeMillis();
            Spider.create(my).addUrl("http://donghua.dmzj.com/donghua_info/" + username + ".html").thread(5).run();
            endTime = System.currentTimeMillis();
            System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒");
        }
    }
}
(2.3)DmzjAnimationDao,java连接MySQL类,
package Webmagic.donghua.dmzj.com;
/***Created by mo
*On 2017/10/23 ***12:08.
******/
import java.sql.*;
/**
 * Minimal JDBC data-access object that inserts {@link DmzjAnimation} rows
 * into the {@code spider.dmzjAnimation} MySQL table.
 *
 * <p>The connection is opened in the constructor. If that fails, the instance
 * is still created but {@link #add(DmzjAnimation)} returns {@code -1}.
 * Call {@link #close()} (or use try-with-resources) to release the connection.
 */
public class DmzjAnimationDao implements AutoCloseable {

    /** Insert statement; dmzjAnimation is the table name. */
    private static final String INSERT_SQL = "INSERT INTO `spider`.`dmzjAnimation` (`id`,`hahawebname`,`antag`,`japanname`, `allname`, `year`,`state`,`tag`, `original`,`screenwriter`,`company`, `website`, `content`,`contentdetail`,`goal`,`mentotal`,`url`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);";

    /** Open connection, or {@code null} when connecting failed or after {@link #close()}. */
    private Connection conn = null;

    /**
     * Loads the MySQL driver and opens the connection. Failures are reported on
     * standard error and leave the DAO without a connection; no exception escapes.
     */
    public DmzjAnimationDao() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            // "spider" is the database; user, password and encoding must be adapted to your setup.
            // NOTE(review): credentials are hard-coded in the URL - move them to configuration.
            String url = "jdbc:mysql://localhost:3306/spider?user=root&password=xiemo&useUnicode=true&characterEncoding=UTF8";
            conn = DriverManager.getConnection(url);
            // Report success only once the connection really exists.
            System.out.println("连接数据库成功");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts one entity as a new row.
     *
     * @param dmzjAnimation the entity to store
     * @return the number of affected rows, or {@code -1} when there is no
     *         connection or the insert failed
     */
    public int add(DmzjAnimation dmzjAnimation) {
        if (conn == null) {
            return -1;
        }
        // try-with-resources closes the statement after every insert.
        try (PreparedStatement ps = conn.prepareStatement(INSERT_SQL)) {
            ps.setInt(1, dmzjAnimation.getId());
            ps.setString(2, dmzjAnimation.getHahawebname());
            ps.setString(3, dmzjAnimation.getAntag());
            ps.setString(4, dmzjAnimation.getJapanname());
            ps.setString(5, dmzjAnimation.getAllname());
            ps.setString(6, dmzjAnimation.getYear());
            ps.setString(7, dmzjAnimation.getState());
            ps.setString(8, dmzjAnimation.getTag());
            ps.setString(9, dmzjAnimation.getOriginal());
            ps.setString(10, dmzjAnimation.getScreenwriter());
            ps.setString(11, dmzjAnimation.getCompany());
            ps.setString(12, dmzjAnimation.getWebsite());
            ps.setString(13, dmzjAnimation.getContent());
            ps.setString(14, dmzjAnimation.getContentdetail());
            ps.setString(15, dmzjAnimation.getGoal());
            ps.setString(16, dmzjAnimation.getMentotal());
            ps.setString(17, dmzjAnimation.getUrl());
            return ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return -1;
    }

    /** Closes the connection if one is open; calling it again has no effect. */
    @Override
    public void close() {
        if (conn == null) {
            return;
        }
        try {
            conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            conn = null;
        }
    }
}
(2.4)结果
IDEA:
MySQL:
希望对你有所帮助!