新入Java爬虫,记录一下爬取http://site.baidu.com/并分类存储的思路和实现代码。
实现思路:
利用Idea+maven构建Web项目这里就不赘述了,配置出现问题的小伙伴欢迎留言交流~
项目结构如下
public class HttpToBeClient {
public Document Get_page(String url) throws Exception {
// new httpclient实例并设置cookie属性
RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.IGNORE_COOKIES).build();
CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(globalConfig).build();
// 新建httpget对象并传入url
HttpGet httpGet = new HttpGet(url);
// 设置请求头
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
// 发起请求
CloseableHttpResponse response = httpClient.execute(httpGet)
// 实例对象并设置编码格式
HttpEntity httpEntity = response.getEntity();
String content = EntityUtils.toString(httpEntity, "utf-8");
// 关闭连接
response.close();
httpClient.close();
// Jsoup解析并返回document对象
Document document = Jsoup.parse(content);
return document;
}
}
打开开发者模式观察页面结构大概可以分成这么几块,接下来就是分别对这几块数据进行解析和提取
package HttpSpider;
import Dao.Page;
import Dao.SelectPageDao;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawls http://site.baidu.com/ one page section at a time and stores every link it
 * finds as a {@link Page} (type, link, content) through {@link SelectPageDao}.
 *
 * <p>Every public method takes the CSS query of the section to parse, downloads the
 * home page again and writes one row per matching anchor.
 */
public class BaiduCilent {
    /** Page downloaded by every section parser. */
    private static final String HOME_URL = "http://site.baidu.com/";

    /** href of anchors that only trigger a script; the filtered sections skip them. */
    private static final String SCRIPT_HREF = "javascript:;";

    /**
     * Stores the links of the top navigation bar. The type of each row is the HTML of
     * the anchor's first element sibling.
     *
     * @param div_name CSS query selecting the navigation container(s)
     * @throws Exception if downloading the page or storing a row fails
     */
    public static void SearchTab(String div_name) throws Exception {
        Document document = fetchHomePage();
        for (Element container : document.select(div_name)) {
            for (Element link : container.getElementsByTag("a")) {
                // Depending on the jsoup version firstElementSibling() may be null for an
                // anchor without siblings; fall back to the anchor itself in that case.
                Element first = link.firstElementSibling();
                String type = (first != null ? first : link).html();
                save(type, link.attr("href"), link.html());
            }
        }
        System.out.println("--------------------------------------这是一条小尾巴--------------------------------------------");
    }

    /**
     * Stores every anchor of the selected section under the type "热搜推荐".
     *
     * @param div_name CSS query selecting the section container(s)
     * @throws Exception if downloading the page or storing a row fails
     */
    public static void HotSearch(String div_name) throws Exception {
        storeLinks(div_name, "热搜推荐", false);
    }

    /**
     * Stores the anchors of the selected section under the type "实时热搜", skipping
     * script-only anchors and the anchor whose HTML is the section caption itself.
     *
     * @param div_name CSS query selecting the section container(s)
     * @throws Exception if downloading the page or storing a row fails
     */
    public static void RealModMt10(String div_name) throws Exception {
        storeLinks(div_name, "实时热搜", true);
    }

    /**
     * Stores the ranking tabs. Tab captions come from {@code div.hd}, the ranked links
     * from {@code div.bd}; tabs "yi"/"er" form one group and "san"/"si" another.
     *
     * <p>Each caption is repeated a fixed number of times (6 for yi/er, 10 for san/si)
     * so that it lines up by index with the links collected for the same group.
     *
     * @param div_name CSS query selecting the tab wrapper
     * @throws Exception if downloading the page or storing a row fails
     */
    public static void TabWrap(String div_name) throws Exception {
        Elements tabs = fetchHomePage().select(div_name);
        Elements hd = tabs.select("div.hd");
        Elements bd = tabs.select("div.bd");

        // Group 1: tabs "yi" and "er".
        List<String> firstTypes = new ArrayList<>();
        List<String> firstLinks = new ArrayList<>();
        List<String> firstTitles = new ArrayList<>();
        repeatText(hd.select("a[monkey^=\"bangdan-yi\"]"), 6, firstTypes);
        repeatText(hd.select("a[monkey^=\"bangdan-er\"]"), 6, firstTypes);
        collectLinks(bd.select("div[monkey^=\"bangdan-yi\"]"), firstLinks, firstTitles);
        // For the "er" tab only anchors whose class starts with clr-black are taken.
        collectLinks(bd.select("div[monkey^=\"bangdan-er\"]").select("a[class^=\"clr-black\"]"),
                firstLinks, firstTitles);

        // Group 2: tabs "san" and "si".
        List<String> secondTypes = new ArrayList<>();
        List<String> secondLinks = new ArrayList<>();
        List<String> secondTitles = new ArrayList<>();
        repeatText(hd.select("a[monkey^=\"bangdan-san\"]"), 10, secondTypes);
        repeatText(hd.select("a[monkey^=\"bangdan-si\"]"), 10, secondTypes);
        collectLinks(bd.select("div[monkey^=\"bangdan-san\"]"), secondLinks, secondTitles);
        collectLinks(bd.select("div[monkey^=\"bangdan-si\"]"), secondLinks, secondTitles);

        saveRows(firstTypes, firstLinks, firstTitles);
        saveRows(secondTypes, secondLinks, secondTitles);
    }

    /**
     * Stores the anchors of the selected section under the type "常用站点", skipping
     * script-only anchors and the anchor whose HTML is the section caption itself.
     *
     * @param div_name CSS query selecting the section container(s)
     * @throws Exception if downloading the page or storing a row fails
     */
    public static void SitesModMt10(String div_name) throws Exception {
        storeLinks(div_name, "常用站点", true);
    }

    /**
     * Stores the anchors of the selected section under the type "实用工具", skipping
     * script-only anchors and the anchor whose HTML is the section caption itself.
     *
     * @param div_name CSS query selecting the section container(s)
     * @throws Exception if downloading the page or storing a row fails
     */
    public static void ToolsMod(String div_name) throws Exception {
        storeLinks(div_name, "实用工具", true);
    }

    /**
     * Stores the site-navigation section. Category captions are the {@code a.clr-blue}
     * anchors, the sites are the {@code a.icon.clr-black} anchors; captions are repeated
     * (5 times for "软件" and "金融", 6 times otherwise) to line up with the sites by index.
     *
     * @param div_name CSS query selecting the section container(s)
     * @throws Exception if downloading the page or storing a row fails
     */
    public static void CoolsMod(String div_name) throws Exception {
        Elements tabs = fetchHomePage().select(div_name);

        List<String> types = new ArrayList<>();
        for (Element heading : tabs.select("a.clr-blue")) {
            for (Element anchor : heading.getElementsByTag("a")) {
                String title = anchor.html().replace(" ", "");
                // NOTE(review): html() returns markup, so a literal ">" may arrive
                // entity-escaped and never equal this string -- confirm against the page.
                if (title.equals("更多>>")) {
                    continue;
                }
                int repeat = (title.equals("软件") || title.equals("金融")) ? 5 : 6;
                for (int i = 0; i < repeat; i++) {
                    types.add(title);
                }
            }
        }
        System.out.println(types);

        List<String> links = new ArrayList<>();
        List<String> titles = new ArrayList<>();
        collectLinks(tabs.select("a.icon.clr-black"), links, titles);

        saveRows(types, links, titles);
    }

    /**
     * Stores the bottom navigation bar. Each {@code a.clr-blue} caption is repeated 24
     * times and paired by index with the {@code a.clr-gray} anchors.
     *
     * @param div_name CSS query selecting the section container(s)
     * @throws Exception if downloading the page or storing a row fails
     */
    public static void ContentBottomMt10(String div_name) throws Exception {
        Elements tabs = fetchHomePage().select(div_name);

        List<String> types = new ArrayList<>();
        for (Element heading : tabs.select("a.clr-blue")) {
            for (Element anchor : heading.getElementsByTag("a")) {
                String title = anchor.html();
                for (int i = 0; i < 24; i++) {
                    types.add(title);
                }
            }
        }
        System.out.println(types);

        List<String> links = new ArrayList<>();
        List<String> tags = new ArrayList<>();
        collectLinks(tabs.select("a.clr-gray"), links, tags);

        saveRows(types, links, tags);
    }

    /**
     * Runs every section parser in page order.
     *
     * @param args unused
     * @throws Exception if any section fails
     */
    public static void main(String[] args) throws Exception {
        // Top navigation bar
        SearchTab("div.search-tab");
        // Hot searches
        HotSearch("div.pages");
        HotSearch("div.hot-searches");
        // News
        TabWrap("div.tab-wrap");
        RealModMt10("div.real.mod.mt10");
        // Frequently used sites
        SitesModMt10("div.sites.mod.mt10");
        // "Guess you like" -- no parser is called for this section
        // Useful tools
        ToolsMod("div.tools.mod");
        // Site navigation
        CoolsMod("div.cools.mod");
        // Bottom navigation bar
        ContentBottomMt10("div.content-bottom.mt10");
    }

    /** Downloads and parses the home page. */
    private static Document fetchHomePage() throws Exception {
        return new HttpToBeClient().Get_page(HOME_URL);
    }

    /** Writes one row through a fresh DAO instance. */
    private static void save(String type, String link, String content) throws Exception {
        SelectPageDao dao = new SelectPageDao();
        dao.add(new Page(type, link, content));
    }

    /**
     * Stores every anchor below the elements matching {@code cssQuery} under one fixed type.
     *
     * @param skipPlaceholders when true, anchors whose href is {@code javascript:;} or
     *                         whose HTML equals {@code type} are not stored
     */
    private static void storeLinks(String cssQuery, String type, boolean skipPlaceholders) throws Exception {
        Document document = fetchHomePage();
        for (Element container : document.select(cssQuery)) {
            for (Element link : container.getElementsByTag("a")) {
                String href = link.attr("href");
                String title = link.html();
                if (skipPlaceholders && (SCRIPT_HREF.equals(href) || type.equals(title))) {
                    continue;
                }
                save(type, href, title);
            }
        }
    }

    /** Appends the text of every element in {@code captions} to {@code out}, {@code times} times each. */
    private static void repeatText(Elements captions, int times, List<String> out) {
        for (Element caption : captions) {
            for (int i = 0; i < times; i++) {
                out.add(caption.text());
            }
        }
    }

    /** Appends href and HTML of every anchor found in {@code containers} to the two lists. */
    private static void collectLinks(Elements containers, List<String> hrefs, List<String> titles) {
        for (Element container : containers) {
            for (Element anchor : container.getElementsByTag("a")) {
                hrefs.add(anchor.attr("href"));
                titles.add(anchor.html());
            }
        }
    }

    /**
     * Stores the index-aligned lists as rows. Only as many rows as the shortest list
     * holds are written, so a page whose layout no longer matches the hard-coded
     * repeat counts cannot cause an IndexOutOfBoundsException half-way through.
     */
    private static void saveRows(List<String> types, List<String> links, List<String> contents) throws Exception {
        int rows = Math.min(types.size(), Math.min(links.size(), contents.size()));
        if (types.size() != links.size()) {
            System.err.println("Type labels (" + types.size() + ") and links (" + links.size()
                    + ") differ in number; storing " + rows + " rows");
        }
        for (int i = 0; i < rows; i++) {
            save(types.get(i), links.get(i), contents.get(i));
        }
    }
}
解析时,每个元素按分类、链接、内容三个维度提取并存储。
import java.sql.*;
/**
 * JDBC helper that opens connections to the WebSpider MySQL database and closes
 * JDBC resources.
 */
public class ConnectDb {
    private static final String driver = "com.mysql.cj.jdbc.Driver";
    private static final String url = "jdbc:mysql://localhost:3306/WebSpider?characterEncoding=UTF8";
    // NOTE(review): credentials are hard-coded in source; consider external configuration.
    private static final String user = "root";
    private static final String password = "123456";

    // Load the driver class once. A missing driver is only reported here; callers
    // still reach getConnection() afterwards.
    static {
        try {
            Class.forName(driver);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a new connection with the configured URL, user and password.
     *
     * @return a new connection; the caller is responsible for closing it
     * @throws SQLException if the connection cannot be established
     */
    public static Connection getConnection() throws SQLException {
        return DriverManager.getConnection(url, user, password);
    }

    /**
     * Closes the result set, the statement and the connection, in that order.
     * Null arguments are skipped. Every non-null resource gets its close() call even
     * when an earlier close() throws.
     *
     * @param conn connection to close, may be null
     * @param stat statement to close, may be null
     * @param rs   result set to close, may be null
     * @throws SQLException if closing any of the resources fails
     */
    public static void closeAll(Connection conn, Statement stat, ResultSet rs) throws SQLException {
        try {
            if (rs != null) {
                rs.close();
            }
        } finally {
            try {
                if (stat != null) {
                    stat.close();
                }
            } finally {
                if (conn != null) {
                    conn.close();
                }
            }
        }
    }
}
数据库表字段设计
/**
 * Plain data holder for one stored entry: an id plus the type, link and content strings.
 */
public class Page {
    private int id;
    private String type;
    private String link;
    private String content;

    /** Creates an entry with all fields at their default values. */
    public Page() {
    }

    /**
     * Creates an entry with the three text fields set; the id keeps its default value.
     *
     * @param type    value for the type field
     * @param link    value for the link field
     * @param content value for the content field
     */
    public Page(String type, String link, String content) {
        this.type = type;
        this.link = link;
        this.content = content;
    }

    /** @return the id */
    public int getId() {
        return this.id;
    }

    /** @param id the new id */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the type */
    public String getType() {
        return this.type;
    }

    /** @param type the new type */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the link */
    public String getLink() {
        return this.link;
    }

    /** @param link the new link */
    public void setLink(String link) {
        this.link = link;
    }

    /** @return the content */
    public String getContent() {
        return this.content;
    }

    /** @param content the new content */
    public void setContent(String content) {
        this.content = content;
    }
}
index.jsp部分代码
分类
标题
链接
<%
    // "list" is the request attribute set by PageServlet. It is absent when the page
    // is opened directly, so fall back to an empty list instead of failing.
    @SuppressWarnings("unchecked")
    List<Page> list = (List<Page>) request.getAttribute("list");
    if (list == null) {
        list = java.util.Collections.emptyList();
    }
    // NOTE(review): the values below are written without HTML escaping.
    for (Page page1 : list) {
%>
<%=page1.getType()%>
<%=page1.getContent()%>
戳我!
<%
    }
%>
上层表单name=“ServletPage” method=“post”
新建 Servlet 类 PageServlet:
public class PageServlet extends HttpServlet {
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
try {
request.setCharacterEncoding("utf-8");
response.setCharacterEncoding("utf-8");
String value = request.getParameter("states");
System.out.println(value);
SelectPageDao dao = new SelectPageDao();
List list = dao.findByType(value);
request.setAttribute("list",list);
request.getRequestDispatcher("index1.jsp").forward(request,response);
}catch (Exception e){
e.printStackTrace();
}
}
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
doPost(request,response);
}
}
数据库查询方法
public class SelectPageDao extends ConnectDb implements PageDao {
/**
 * Loads every row of the {@code sitebd} table.
 *
 * @return a list of {@link Page} objects with type, link and content set
 * @throws Exception if the connection or the query fails
 */
@Override
public List findAllPages() throws Exception {
    String sql = "select * from sitebd";
    List<Page> pages = new ArrayList<>();
    // try-with-resources releases result set, statement and connection even when
    // the query or the row mapping throws.
    try (Connection conn = ConnectDb.getConnection();
         PreparedStatement pstmt = conn.prepareStatement(sql);
         ResultSet rs = pstmt.executeQuery()) {
        while (rs.next()) {
            Page page = new Page();
            page.setType(rs.getString("type"));
            page.setLink(rs.getString("link"));
            page.setContent(rs.getString("content"));
            pages.add(page);
        }
    }
    return pages;
}
/**
 * Loads the rows of the {@code sitebd} table whose {@code type} column equals the given value.
 *
 * @param type value bound to the {@code type = ?} condition
 * @return a list of {@link Page} objects with type, link and content set
 * @throws Exception if the connection or the query fails
 */
@Override
public List findByType(String type) throws Exception {
    String sql = "select * from sitebd where type = ?";
    List<Page> pages = new ArrayList<>();
    // try-with-resources releases result set, statement and connection even when
    // the query or the row mapping throws.
    try (Connection conn = ConnectDb.getConnection();
         PreparedStatement pstmt = conn.prepareStatement(sql)) {
        pstmt.setString(1, type);
        try (ResultSet rs = pstmt.executeQuery()) {
            while (rs.next()) {
                Page page = new Page();
                page.setType(rs.getString("type"));
                page.setLink(rs.getString("link"));
                page.setContent(rs.getString("content"));
                pages.add(page);
            }
        }
    }
    return pages;
}
第一个是全部查询,第二个是根据前台传过来的数据作为Key查询
web.xml还应该如下配置:
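<servlet>
    <servlet-name>PageServlet</servlet-name>
    <servlet-class>Controller.PageServlet</servlet-class>
</servlet>
<servlet-mapping>
    <servlet-name>PageServlet</servlet-name>
    <url-pattern>/PageServlet</url-pattern>
</servlet-mapping>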
启动tomcat服务器
可以看到数据库中之前结构化存储的数据已经根据选择好的标签查询到了。
因篇幅所限,无法在文中贴出所有代码,完整代码已上传到我的 GitHub:https://github.com/airenle/sitesbd
新人水平有限,有问题欢迎大家一起交流哈~