写这个的目的是为了爬歌词,因为喜欢听歌,遇到喜欢的歌就喜欢把歌词下载下来。
WebMagic 教程地址
http://webmagic.io/docs/zh/posts/ch1-overview/
使用 IDEA 创建 Maven 工程
下面为工程目录结构
下面为源代码
package bean;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import java.sql.Timestamp;
import java.util.Date;
import java.util.List;
/**
 * Page model for a song page on kuwo.cn. Fields carrying {@link ExtractBy} are
 * annotated with the XPath expression used to fill them; pages are matched by the
 * {@link TargetUrl} pattern on the class.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-18 23:12:34
 */
@TargetUrl("http://www.kuwo.cn/yinyue/*")
public class KuWoMusic {

    /** Song title. */
    @ExtractBy("//p[@id='lrcName']/text()")
    private String name;

    /** Singer. */
    @ExtractBy("//p[@class='artist']/span/a/text()")
    private String singer;

    /** Lyric lines, one element per matched {@code <p class="lrcItem">} node. */
    @ExtractBy("//p[@class='lrcItem']")
    private List<String> lyrics;

    /** Explicitly assigned full lyric text; when null the text is derived from {@link #lyrics}. */
    private String lyric;

    /** Album the song belongs to. */
    @ExtractBy("//p[@class='album']/span/a/text()")
    private String album;

    /** Explicitly assigned record time; when null the current time is reported. */
    private Timestamp recordTime;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSinger() {
        return singer;
    }

    public void setSinger(String singer) {
        this.singer = singer;
    }

    public List<String> getLyrics() {
        return lyrics;
    }

    public void setLyrics(List<String> lyrics) {
        this.lyrics = lyrics;
    }

    /**
     * Returns the full lyric text.
     *
     * @return the value given to {@link #setLyric(String)} if one was set; otherwise
     *         all entries of {@link #getLyrics()} concatenated in order, or an empty
     *         string when no lyric lines are present
     */
    public String getLyric() {
        if (lyric != null) {
            return lyric;
        }
        // Guard against pages where no lyric line was extracted.
        if (lyrics == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for (String line : lyrics) {
            sb.append(line);
        }
        return sb.toString();
    }

    /**
     * Sets the full lyric text explicitly, overriding the text derived from the
     * individual lyric lines.
     *
     * @param lyric full lyric text; {@code null} restores the derived value
     */
    public void setLyric(String lyric) {
        this.lyric = lyric;
    }

    public String getAlbum() {
        return album;
    }

    public void setAlbum(String album) {
        this.album = album;
    }

    /**
     * Returns the record time.
     *
     * @return the value given to {@link #setRecordTime(Timestamp)} if one was set;
     *         otherwise a timestamp of the current time
     */
    public Timestamp getRecordTime() {
        if (recordTime != null) {
            return recordTime;
        }
        return new Timestamp(System.currentTimeMillis());
    }

    public void setRecordTime(Timestamp recordTime) {
        this.recordTime = recordTime;
    }

    @Override
    public String toString() {
        // Use the getter so the lyric text derived from the lyric lines is shown.
        return "[name:" + name + ",singer=" + singer + ",album=" + album + ",lyric=" + getLyric() + "]";
    }
}
package dao;
import bean.KuWoMusic;
import org.apache.ibatis.annotations.Insert;
/**
 * Mapper interface for persisting crawled {@link KuWoMusic} records; the SQL is
 * declared inline through the MyBatis {@link Insert} annotation.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-19 00:37:57
 */
public interface KuWoMusicDao {

    /**
     * Inserts one row into the {@code lyric} table. The title, content, singer, album
     * and record time come from the {@code name}, {@code lyric}, {@code singer},
     * {@code album} and {@code recordTime} properties of the argument; source,
     * recorder and status are fixed literals in the statement.
     *
     * @param kuWoMusic the song to store
     * @return the integer result of the insert statement
     */
    @Insert("insert into lyric "
            + "(`title`,`content`,`source`,`singer`,`album`,`recorder`,`recordTime`,`curStatus`) "
            + "values (#{name},#{lyric},'酷我',#{singer},#{album},'admin',#{recordTime},'2')")
    int add(KuWoMusic kuWoMusic);
}
package dao.pipeline;
import dao.KuWoMusicDao;
import bean.KuWoMusic;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
/**
 * Pipeline that prints each extracted {@link KuWoMusic} and stores it through
 * {@link KuWoMusicDao}.
 *
 * <p>The DAO is looked up from a Spring context that this class builds itself from
 * {@code root-context.xml}, so the pipeline also works when created with {@code new}.
 *
 * <p>NOTE(review): the class is also annotated with {@code @Component}. If
 * {@code root-context.xml} component-scans this package, instantiating this bean
 * would load the same context again — confirm the scan configuration.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-19 00:42:41
 */
@Component("KuWoMusicDaoPipeline")
public class KuWoMusicDaoPipeline implements PageModelPipeline<KuWoMusic> {

    private final ApplicationContext context = new ClassPathXmlApplicationContext("root-context.xml");

    private final KuWoMusicDao kuWoMusicDao = context.getBean("kuWoMusicDao", KuWoMusicDao.class);

    /**
     * Prints the song to standard output and inserts it via the DAO.
     *
     * @param kuWoMusic the extracted page model
     * @param task      the task the page belongs to (unused)
     */
    @Override
    public void process(KuWoMusic kuWoMusic, Task task) {
        // Print the lyric information before persisting it.
        System.out.println(kuWoMusic.toString());
        kuWoMusicDao.add(kuWoMusic);
    }
}
package execute;
import dao.pipeline.KuWoMusicDaoPipeline;
import bean.KuWoMusic;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
/**
 * Entry point that runs the lyric crawler.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-18 23:23:43
 */
public class LyricCrawlerExecutor {

    /** Page the crawl starts from. */
    private static final String SEED_URL = "http://www.kuwo.cn/yinyue/492211?catalog=yueku2016";

    /** Number of crawler threads. */
    private static final int THREAD_COUNT = 2;

    /**
     * Crawls starting at {@link #SEED_URL}, mapping pages onto {@link KuWoMusic} and
     * handing each result to {@link KuWoMusicDaoPipeline}, which saves it to the database.
     *
     * <p>To print results to the console instead, pass a
     * {@code ConsolePageModelPipeline} in place of the DAO pipeline.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Site site = Site.me();
        KuWoMusicDaoPipeline databasePipeline = new KuWoMusicDaoPipeline();

        OOSpider.create(site, databasePipeline, KuWoMusic.class)
                .addUrl(SEED_URL)
                .thread(THREAD_COUNT)
                .run();
    }
}
# Root logger: the first token is the level, the remaining tokens are appender names.
# Only "stdout" is defined below, so it is the only appender listed.
log4j.rootLogger=INFO,stdout
# Console appender with a "date level [logger] -message" layout.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] -%m%n
# Uncomment the loggers below to raise individual categories to debug.
#log4j.logger.com.ibatis=debug
#log4j.logger.com.ibatis.common.jdbc.SimpleDataSource=debug
#log4j.logger.com.ibatis.common.jdbc.ScriptRunner=debug
#log4j.logger.com.ibatis.sqlmap.engine.impl.SqlMapClientDelegate=debug
#log4j.logger.java.sql.Connection=debug
#log4j.logger.java.sql.Statement=debug
#log4j.logger.java.sql.PreparedStatement=debug,stdout
pom.xml文件
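<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>lyric.crawler</groupId>
    <artifactId>lyric-crawler</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <spring.version>4.2.0.RELEASE</spring.version>
        <mybatis.version>3.3.0</mybatis.version>
        <mysql.version>5.1.29</mysql.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.6.1</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context-support</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-oxm</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-tx</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>${mybatis.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis-spring</artifactId>
            <version>1.2.3</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
        </dependency>
        <dependency>
            <groupId>commons-dbcp</groupId>
            <artifactId>commons-dbcp</artifactId>
            <version>1.4</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>lyriccrawler</finalName>
        <resources>
            <resource>
                <directory>src/main/java</directory>
                <includes>
                    <include>*.xml</include>
                    <include>*.properties</include>
                    <include>*.tld</include>
                    <include>*.txt</include>
                    <include>*.cfg</include>
                    <include>**/**/**/*.xml</include>
                    <include>**/**/**/**/*.xml</include>
                </includes>
            </resource>
        </resources>
    </build>
</project>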
工程源代码下载地址
https://github.com/airujingye/lyriccrawler