博客园博文爬取 标签爬取(含源代码)

爬取思路:

1,先在博客园首页(https://www.cnblogs.com/)上爬取推荐博文列表:

博客园博文爬取 标签爬取(含源代码)_第1张图片

 

 

 2,根据这些推荐博文进一步到发布这些推荐博文的博主主页中:

博客园博文爬取 标签爬取(含源代码)_第2张图片

 

 

 3,如果要爬取标签,可以查看这些博主的标签页:只需在博主主页地址后面加上 /tag/,就可以跳转到该博主的标签页

博客园博文爬取 标签爬取(含源代码)_第3张图片

 

 

 4,如果要爬取博文内容,就进入这些博主的主页,再逐一打开主页中列出的每篇博文页面进行爬取

下面是我的代码:

  1 package use;
  2 
  3 import java.sql.Connection;
  4 import java.sql.PreparedStatement;
  5 import java.util.ArrayList;
  6 import java.util.Date;
  7 import java.util.List;
  8 
  9 import com.dao.ClarifyDao;
 10 import com.dao.InfoDao;
 11 import org.jsoup.Jsoup;
 12 import org.jsoup.nodes.Document;
 13 
 14 import us.codecraft.webmagic.Page;
 15 import us.codecraft.webmagic.Site;
 16 import us.codecraft.webmagic.Spider;
 17 import us.codecraft.webmagic.processor.PageProcessor;
 18 
 19 public class 博客园内容 implements PageProcessor {
 20     static int nn=0;
 21     static String regEx="[\n`'' ]";
 22    // static String regEx="[\n`~!@#$%^&()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?? ]";
 23     static String aa = "";//这里是将特殊字符换为aa字符串," "代表直接去掉
 24     private static Connection conn = null;
 25 
 26     private static PreparedStatement ps = null;
 27     // 标题和链接获取
 28 
 29     private static String TITLEQUERY = "div.post_item_body h3 a.titlelnk";
 30 
 31     private static String TITLE = "div.post h1 a.postTitle2";
 32     // 作者
 33 
 34     private static String AUTHORQUERY = "div.post_item_foot a.lightblue ";
 35 
 36 
 37     //初始化带爬取网页地址
 38     private static List urls() {
 39         List listUrl=new ArrayList();
 40         for(int i=1;i<=200;i++) {
 41             listUrl.add("https://www.cnblogs.com/sitehome/p/"+i);
 42 
 43         }
 44         listUrl.toArray(new String[listUrl.size()]);
 45         return listUrl;
 46     }
 47     private static void add_urls_child(Page page) {
 48         List listUrl=new ArrayList();
 49         listUrl= page.getHtml().xpath("//*[@id=\"post_list\"]//*/div[2]/div/a//@href").all();
 50 
 51         listUrl.toArray(new String[listUrl.size()]);
 52         page.addTargetRequests(listUrl);
 53 
 54     }
 55 
 56     private static void add_urls_child_page(Page page) {
 57         List listUrl=new ArrayList();
 58         listUrl= page.getHtml().xpath("//div[@class=\"postTitle\"]/a//@href").all();
 59 
 60         listUrl.toArray(new String[listUrl.size()]);
 61         page.addTargetRequests(listUrl);
 62 
 63     }
 64 
 65     //jsoup根据html字符串和语法来获取内容
 66     private static String selectDocumentText(String htmlText,String Query) {
 67         Document doc=Jsoup.parse(htmlText);
 68         String select=doc.select(Query).text();
 69         return select;
 70     }
 71 
 72     //jsoup根据html字符串和语法获取链接地址
 73     private static String selectDocumentLink(String htmlText,String Query) {
 74         Document doc=Jsoup.parse(htmlText);
 75         String select=doc.select(Query).attr("href");
 76         return select;
 77     }
 78 
 79     @Override
 80     public Site getSite() {
 81         return Site.me().setSleepTime(1000).setRetryTimes(10);
 82     }
 83 
 84     //编写抽取逻辑
 85     @Override
 86     public void process(Page page) {
 87         nn=nn+1;
 88         if(nn==1)
 89         {
 90             System.out.println("TTTTTTTTTTTTT");
 91             page.addTargetRequests(urls());
 92         }
 93 
 94         String str = page.getUrl().get();
 95 
 96         if(str.matches("https://www.cnblogs.com/sitehome/p/[0-9]+"))
 97         {
 98             System.out.println("AAAAA");
 99             add_urls_child(page);
100         }
101         else if(str.matches("https://www.cnblogs.com/[A-Za-z0-9_-]+/"))
102         {
103             System.out.println("BBBBBBB");
104             add_urls_child_page(page);
105         }else
106         {
107             System.out.println("DDDDDD");
108 
109             String title=page.getHtml().xpath("//*[@id='cb_post_title_url']//text()").get();
110 
111             String URL=page.getUrl().get();
112 
113 
114 
115             String author=page.getHtml().xpath("//*[@id='Header1_HeaderTitle']//text()").get();
116             List values=new ArrayList();
117             values=page.getHtml().xpath("//*[@id='cnblogs_post_body']//*//text()").all();
118             String info="";
119             for(String value:values)
120             {
121                 info+=value;
122             }
123             info=info.replaceAll(regEx, aa);
124             System.out.println("Title:\t"+title);
125             System.out.println("AUTHOR:\t"+author);
126             System.out.println(  "VALUE:\t"+info);
127             System.out.println("URL:\t"+URL);
128             ClarifyDao.add("blog_info","",title,author,info,URL);
129 
130         }
131 
132 
133 
134 
135  /*
136         //定义如何抽取页面信息
137 
138         List htmls=page.getHtml().xpath("//div[@class='post_item']/html()").all();
139 
140        // List javaBokes=new ArrayList();
141         for(String html:htmls) {
142           //  JavaBokeModel javaBoke=new JavaBokeModel();
143             //标题和链接
144             String title=selectDocumentText(html,TITLEQUERY);
145 
146             String linke=selectDocumentLink(html,TITLEQUERY);
147             //作者和作者主页
148             String author=selectDocumentText(html,AUTHORQUERY);
149 
150             System.out.println(
151                     "TITLE\t"+title+
152                             "Link\t"+linke+
153                             "Author\t"+author
154             );
155 
156 
157 
158         }
159  */
160         //File.WriteStringToFile2(javaBokes);
161 
162 
163     }
164 
165     public static void main(String[] args) {
166         long startTime,endTime;
167         //DBUtil.getConnection();
168         startTime=new Date().getTime();
169         InfoDao.delete("blog_info");
170         Spider create=Spider.create(new 博客园内容());
171         create.addUrl("https://www.cnblogs.com/").thread(5).run();
172         try {
173             ps.close();
174             conn.close();
175         }catch(Exception e) {
176 
177         }
178         endTime=new Date().getTime();
179         System.out.println("用时为:"+(endTime-startTime)/1000+"s");
180 
181     }
182 
183 }
博文内容代码
  1 package use;
  2 
  3 import java.sql.Connection;
  4 import java.sql.PreparedStatement;
  5 import java.util.ArrayList;
  6 import java.util.Date;
  7 import java.util.List;
  8 
  9 import com.dao.InfoDao;
 10 import org.jsoup.Jsoup;
 11 import org.jsoup.nodes.Document;
 12 
 13 import us.codecraft.webmagic.Page;
 14 import us.codecraft.webmagic.Site;
 15 import us.codecraft.webmagic.Spider;
 16 import us.codecraft.webmagic.processor.PageProcessor;
 17 
 18 public class 博客园标签 implements PageProcessor {
 19     static int nn=0;
 20     private static Connection conn = null;
 21 
 22     private static PreparedStatement ps = null;
 23     // 标题和链接获取
 24 
 25     private static String TITLEQUERY = "div.post_item_body h3 a.titlelnk";
 26 
 27     private static String TITLE = "div.post h1 a.postTitle2";
 28     // 作者
 29 
 30     private static String AUTHORQUERY = "div.post_item_foot a.lightblue ";
 31 
 32 
 33     //初始化带爬取网页地址
 34     private static List urls() {
 35         List listUrl=new ArrayList();
 36         for(int i=2;i<=200;i++) {
 37             listUrl.add("https://www.cnblogs.com/sitehome/p/"+i);
 38 
 39         }
 40         listUrl.toArray(new String[listUrl.size()]);
 41         return listUrl;
 42     }
 43     private static void add_urls_child(Page page) {
 44         List listUrl=new ArrayList();
 45         List Urls=new ArrayList();
 46         Urls= page.getHtml().xpath("//*[@id=\"post_list\"]//*/div[2]/div/a//@href").all();
 47 
 48         for(String ur:Urls)
 49         {
 50             ur+="tag/";
 51             listUrl.add(ur);
 52         }
 53         listUrl.toArray(new String[listUrl.size()]);
 54         page.addTargetRequests(listUrl);
 55 
 56     }
 57 
 58     //jsoup根据html字符串和语法来获取内容
 59     private static String selectDocumentText(String htmlText,String Query) {
 60         Document doc=Jsoup.parse(htmlText);
 61         String select=doc.select(Query).text();
 62         return select;
 63     }
 64 
 65     //jsoup根据html字符串和语法获取链接地址
 66     private static String selectDocumentLink(String htmlText,String Query) {
 67         Document doc=Jsoup.parse(htmlText);
 68         String select=doc.select(Query).attr("href");
 69         return select;
 70     }
 71 
 72     @Override
 73     public Site getSite() {
 74         return Site.me().setSleepTime(1000).setRetryTimes(10);
 75     }
 76 
 77     //编写抽取逻辑
 78     @Override
 79     public void process(Page page) {
 80         nn=nn+1;
 81         if(nn==1)
 82         {
 83             page.addTargetRequests(urls());
 84         }
 85         if(page.getUrl().regex("https://www.cnblogs.com/sitehome/p/[0-9]+").match())
 86         {
 87             add_urls_child(page);
 88         }
 89 
 90         else
 91         {
 92             System.out.println("DDDDDD");
 93 
 94             String title=page.getHtml().xpath("//*[@id=\"Header1_HeaderTitle\"]//text()").get();
 95             String URL=page.getUrl().get();
 96             System.out.println("Title:\t"+title);
 97             System.out.println("URL:\t"+URL);
 98             List tags=new ArrayList();
 99             tags=page.getHtml().xpath("//*[@id=\"MyTag1_dtTagList\"]/tbody//a//text()").all();
100             for(String tag:tags)
101             {
102                 System.out.println(
103                         "TAG:\t"+tag
104                 );
105                 InfoDao.add("blog",tag,title,URL);
106             }
107 
108 
109         }
110 
111 
112 
113 
114  /*
115         //定义如何抽取页面信息
116 
117         List htmls=page.getHtml().xpath("//div[@class='post_item']/html()").all();
118 
119        // List javaBokes=new ArrayList();
120         for(String html:htmls) {
121           //  JavaBokeModel javaBoke=new JavaBokeModel();
122             //标题和链接
123             String title=selectDocumentText(html,TITLEQUERY);
124 
125             String linke=selectDocumentLink(html,TITLEQUERY);
126             //作者和作者主页
127             String author=selectDocumentText(html,AUTHORQUERY);
128 
129             System.out.println(
130                     "TITLE\t"+title+
131                             "Link\t"+linke+
132                             "Author\t"+author
133             );
134 
135 
136 
137         }
138  */
139         //File.WriteStringToFile2(javaBokes);
140 
141 
142     }
143 
144     public static void main(String[] args) {
145         long startTime,endTime;
146         //DBUtil.getConnection();
147         startTime=new Date().getTime();
148 
149         Spider create=Spider.create(new 博客园标签());
150         create.addUrl("http://www.cnblogs.com/").thread(5).run();
151         try {
152             ps.close();
153             conn.close();
154         }catch(Exception e) {
155 
156         }
157         endTime=new Date().getTime();
158         System.out.println("用时为:"+(endTime-startTime)/1000+"s");
159 
160     }
161 
162 }
标签代码

 

你可能感兴趣的:(博客园博文爬取 标签爬取(含源代码))