使用 WebMagic 编写 Java 网络爬虫

写这个的目的是为了爬歌词,因为喜欢听歌,遇到喜欢的歌就喜欢把歌词下载下来。


WebMagic 教程地址

http://webmagic.io/docs/zh/posts/ch1-overview/


使用 IDEA 创建 Maven 工程


下面为工程目录结构

使用WebMagic 编写 java 网络爬虫_第1张图片



下面为源代码


package bean;

import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;

import java.sql.Timestamp;
import java.util.Date;
import java.util.List;

/**
 * Page model for a song page matching {@code http://www.kuwo.cn/yinyue/*}.
 * Each annotated field carries the XPath expression used to extract its value.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-18 23:12:34
 */
@TargetUrl("http://www.kuwo.cn/yinyue/*")
public class KuWoMusic {
    /**
     * Song title.
     */
    // Alternative CSS selector: @ExtractBy(value="div.tit em.f-ff2",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@id='lrcName']/text()")
    private String name;

    /**
     * Singer.
     */
    // Alternative CSS selector: @ExtractBy(value="p.des span a",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@class='artist']/span/a/text()")
    private String singer;

    /**
     * Lyric lines, one entry per matched {@code <p class="lrcItem">} element.
     */
    // Alternative CSS selector: @ExtractBy(value="div.mCSB_container p",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@class='lrcItem']")
    private List<String> lyrics;

    /**
     * Explicitly assigned lyric text; only used by {@link #getLyric()} when no
     * lyric lines are available.
     */
    private String lyric;

    /**
     * Album the song belongs to.
     */
    // Alternative CSS selector: @ExtractBy(value="p.des a",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@class='album']/span/a/text()")
    private String album;

    /**
     * Time the record was captured; {@code null} until explicitly set.
     */
    private Timestamp recordTime;

    // Disabled field that captured the whole page body:
    //    @ExtractBy(value="body",type = ExtractBy.Type.Css)
    //    private String body;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSinger() {
        return singer;
    }

    public void setSinger(String singer) {
        this.singer = singer;
    }

    public List<String> getLyrics() {
        return lyrics;
    }

    public void setLyrics(List<String> lyrics) {
        this.lyrics = lyrics;
    }

    /**
     * Returns the full lyric text.
     *
     * @return the lyric lines concatenated in order when {@code lyrics} is set;
     *         otherwise the value last passed to {@link #setLyric(String)}
     *         (possibly {@code null})
     */
    public String getLyric() {
        if (lyrics == null) {
            // Nothing was extracted: fall back to the explicitly assigned text
            // instead of failing with a NullPointerException.
            return lyric;
        }
        StringBuilder sb = new StringBuilder();
        for (String line : lyrics) {
            sb.append(line);
        }
        return sb.toString();
    }

    /**
     * Stores an explicit lyric text. It is returned by {@link #getLyric()} only
     * while no lyric lines are present.
     *
     * @param lyric the lyric text to store
     */
    public void setLyric(String lyric) {
        this.lyric = lyric;
    }

    public String getAlbum() {
        return album;
    }

    public void setAlbum(String album) {
        this.album = album;
    }

    /**
     * Returns the record time.
     *
     * @return the value given to {@link #setRecordTime(Timestamp)}, or the
     *         current time when none has been set
     */
    public Timestamp getRecordTime() {
        if (recordTime != null) {
            return recordTime;
        }
        return new Timestamp(System.currentTimeMillis());
    }

    public void setRecordTime(Timestamp recordTime) {
        this.recordTime = recordTime;
    }

    @Override
    public String toString() {
        // Use getLyric() rather than the raw field so the extracted lines show up.
        return "[name:" + name + ",singer=" + singer + ",album=" + album + ",lyric=" + getLyric() + "]";
    }
}


package dao;

import bean.KuWoMusic;
import org.apache.ibatis.annotations.Insert;

/**
 * MyBatis mapper for persisting {@link KuWoMusic} records into the
 * {@code lyric} table.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-19 00:37:57
 */
public interface KuWoMusicDao {
    /**
     * Inserts one row into {@code lyric}. The {@code name}, {@code lyric},
     * {@code singer}, {@code album} and {@code recordTime} properties of the
     * argument are bound as parameters; source, recorder and status are fixed
     * literals in the statement.
     *
     * @param kuWoMusic the song to store
     * @return the insert result as reported by MyBatis (presumably the number
     *         of affected rows)
     */
    @Insert("insert into lyric "
            + "(`title`,`content`,`source`,`singer`,`album`,`recorder`,`recordTime`,`curStatus`) "
            + "values (#{name},#{lyric},'酷我',#{singer},#{album},'admin',#{recordTime},'2')")
    int add(KuWoMusic kuWoMusic);
}


package dao.pipeline;

import dao.KuWoMusicDao;
import bean.KuWoMusic;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PageModelPipeline;

/**
 * Pipeline that prints each extracted {@link KuWoMusic} and stores it through
 * {@link KuWoMusicDao}.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-19 00:42:41
 */
@Component("KuWoMusicDaoPipeline")
public class KuWoMusicDaoPipeline implements PageModelPipeline<KuWoMusic> {

    // The DAO is looked up from a context built here because the pipeline is
    // created with "new" by the crawler entry point rather than injected.
    // NOTE(review): if root-context.xml component-scans this package, this
    // @Component would build another context while being instantiated - confirm.
    ApplicationContext context = new ClassPathXmlApplicationContext("root-context.xml");
    KuWoMusicDao kuWoMusicDao = (KuWoMusicDao) context.getBean("kuWoMusicDao");
    // Injection-based alternative (disabled):
    //    @Resource
    //    private KuWoMusicDao kuWoMusicDao;

    /**
     * Prints the song to standard output and inserts it via the DAO.
     *
     * @param kuWoMusic the extracted page model
     * @param task      the crawler task that produced the model (unused)
     */
    @Override
    public void process(KuWoMusic kuWoMusic, Task task) {
        // Print the lyric information.
        System.out.println(kuWoMusic.toString());
        kuWoMusicDao.add(kuWoMusic);
    }
}

package execute;

import dao.pipeline.KuWoMusicDaoPipeline;
import bean.KuWoMusic;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;

/**
 * Entry point that runs the lyric crawler.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-18 23:23:43
 */
public class LyricCrawlerExecutor {
    public static void main(String[] args) {
        final String seedUrl = "http://www.kuwo.cn/yinyue/492211?catalog=yueku2016";
        final int workerThreads = 2;

        Site site = Site.me();
        // Save results to the database.
        KuWoMusicDaoPipeline databasePipeline = new KuWoMusicDaoPipeline();

        OOSpider.create(site, databasePipeline, KuWoMusic.class)
                .addUrl(seedUrl)
                .thread(workerThreads)
                .run();

        // To print results to the console instead, pass
        // "new ConsolePageModelPipeline()" as the pipeline argument above.

        // To check the Spring wiring on its own, load "root-context.xml" with
        // ClassPathXmlApplicationContext and print getBean("kuWoMusicDao").
    }
}

# Root logger: the first token is the level, every following token is an
# appender name. A second level such as DEBUG must not be listed here, because
# it would be looked up as an (undefined) appender.
log4j.rootLogger=INFO,stdout

# Console appender with a "date level [logger] -message" layout.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] -%m%n


#log4j.logger.com.ibatis=debug
#log4j.logger.com.ibatis.common.jdbc.SimpleDataSource=debug
#log4j.logger.com.ibatis.common.jdbc.ScriptRunner=debug
#log4j.logger.com.ibatis.sqlmap.engine.impl.SqlMapClientDelegate=debug
#log4j.logger.java.sql.Connection=debug
#log4j.logger.java.sql.Statement=debug
#log4j.logger.java.sql.PreparedStatement=debug,stdout




	
	
		
		
		
		
		
		
		
		
		
		
	

	
	
		
	

	
	
		
	

	
		
		
	

	
	
	
	


pom.xml文件



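<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>lyric.crawler</groupId>
    <artifactId>lyric-crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <spring.version>4.2.0.RELEASE</spring.version>
        <mybatis.version>3.3.0</mybatis.version>
        <mysql.version>5.1.29</mysql.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.6.1</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context-support</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-oxm</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-tx</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>${mybatis.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis-spring</artifactId>
            <version>1.2.3</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
        </dependency>
        <dependency>
            <groupId>commons-dbcp</groupId>
            <artifactId>commons-dbcp</artifactId>
            <version>1.4</version>
        </dependency>
    </dependencies>
    <build>
        <finalName>lyriccrawler</finalName>
        <resources>
            <resource>
                <directory>src/main/java</directory>
                <includes>
                    <include>*.xml</include>
                    <include>*.properties</include>
                    <include>*.tld</include>
                    <include>*.txt</include>
                    <include>*.cfg</include>
                    <include>**/**/**/*.xml</include>
                    <include>**/**/**/**/*.xml</include>
                </includes>
            </resource>
        </resources>
    </build>
</project>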






工程源代码下载地址

https://github.com/airujingye/lyriccrawler




你可能感兴趣的:(java,网络爬虫,maven,webmagic,mysql)