Writing a Crawler to Collect a Data Source
Use jsoup to crawl the relevant job postings from a recruitment website (51job.com in this example).
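Before walking through the full implementation, here is a minimal sketch of fetching and parsing a page with jsoup, assuming jsoup is on the classpath; the URL is a placeholder, and the #resultList selector matches the result-list markup assumed throughout this article:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupQuickStart {
    public static void main(String[] args) throws Exception {
        // Placeholder URL; replace with a real search-result page
        Document doc = Jsoup.connect("https://example.com/jobs")
                .userAgent("Mozilla/5.0") // some sites reject requests without a browser User-Agent
                .timeout(4000)            // connect/read timeout in milliseconds
                .get();
        // CSS selector: every link inside ".el" rows under #resultList
        for (Element link : doc.select("#resultList .el a")) {
            System.out.println(link.text() + " -> " + link.attr("href"));
        }
    }
}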
Implementation code:
1.JobBean.java
public class JobBean {
    private String jobName;
    private String comName;
    private String addr;
    private String salary;
    private String date;

    public String getJobName() {
        return jobName;
    }

    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    public String getComName() {
        return comName;
    }

    public void setComName(String comName) {
        this.comName = comName;
    }

    public String getAddr() {
        return addr;
    }

    public void setAddr(String addr) {
        this.addr = addr;
    }

    public String getSalary() {
        return salary;
    }

    public void setSalary(String salary) {
        this.salary = salary;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    @Override
    public String toString() {
        return "JobBean [jobName=" + jobName + ", comName=" + comName + ", addr=" + addr + ", salary=" + salary
                + ", date=" + date + "]";
    }

    public void set(String jobName, String comName, String addr, String salary, String date) {
        this.jobName = jobName;
        this.comName = comName;
        this.addr = addr;
        this.salary = salary;
        this.date = date;
    }
}
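The crawler below writes JobBean.toString() lines to a text file. If that file is meant to be parsed later (the output path suggests a data-processing pipeline), one tab-separated record per line is easier to split than the toString() format; a small optional helper, with toTsv being a name of my own choosing:

// Hypothetical helper; could live in JobBean or a utility class
public static String toTsv(JobBean job) {
    return String.join("\t", job.getJobName(), job.getComName(),
            job.getAddr(), job.getSalary(), job.getDate());
}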
2.Page.java
import org.jsoup.nodes.Document;

public class Page {
    private Document document;
    private String nextPageUrl;
    private boolean hasNextPage;

    public String getNextPageUrl() {
        return nextPageUrl;
    }

    public void setNextPageUrl(String nextPageUrl) {
        this.nextPageUrl = nextPageUrl;
    }

    public boolean isHasNextPage() {
        return hasNextPage;
    }

    public void setHasNextPage(boolean hasNextPage) {
        this.hasNextPage = hasNextPage;
    }

    public Document getDocument() {
        return document;
    }

    public void setDocument(Document document) {
        this.document = document;
    }

    @Override
    public String toString() {
        return "Page [document=" + document + ", nextPageUrl=" + nextPageUrl + ", hasNextPage=" + hasNextPage + "]";
    }
}
3.TestJsoup.java
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class TestJsoup {
    public static void main(String[] args) throws Exception {
        Document document = getDocumentByUrl();
        getValue(document);
        // 北京拓尔思信息技术股份有限公司
    }

    /**
     * Extract concrete values: 1) the text between a pair of tags, 2) the value of a tag attribute.
     *
     * @param document
     */
    public static void getValue(Document document) {
        Elements select = document.select("#resultList .el a");
        for (Element element : select) {
            // The text between the opening and closing tags
            // System.out.println(element.text());
            String attr = element.attr("href");
            System.out.println(attr);
        }
    }

    /**
     * There are three ways to locate data: 1) by id, 2) by class, 3) by tag name.
     *
     * select() accepts all three forms directly as a CSS selector.
     *
     * @param document
     */
    public static void getElements(Document document) {
        // Look up by id: an id is unique, so the result is a single, specific element
        Element elementById = document.getElementById("resultList");
        // System.out.println(elementById);
        // Look up by class
        Elements elementsByClass = document.getElementsByClass("el");
        /*
         * for (Element element : elementsByClass) {
         *     System.out.println(element);
         *     System.out.println("------------------");
         * }
         */
        // Look up by tag name
        Elements elementsByTag = document.getElementsByTag("a");
        /*
         * for (Element element : elementsByTag) {
         *     System.out.println(element);
         *     System.out.println("------------------");
         * }
         */
        // CSS selector
        Elements select = document.select("#resultList .el");
        for (Element element : select) {
            System.out.println(element);
            System.out.println("-----------");
        }
    }

    /**
     * Navigate to data through parent/child and sibling elements.
     *
     * @param document
     */
    public static void getElementsByC(Document document) {
        Elements select = document.select("#resultList .el span");
        for (Element element : select) {
            Elements children = element.children();
            Element parent = element.parent();
            // All sibling elements
            element.siblingElements();
            // The next sibling element
            element.nextElementSibling();
            // The previous sibling element
            element.previousElementSibling();
            for (Element element2 : children) {
                System.out.println(element2);
            }
        }
    }

    /**
     * Fetch the page at the hard-coded URL and return the parsed document.
     *
     * @return
     * @throws MalformedURLException
     * @throws IOException
     */
    private static Document getDocumentByUrl() throws MalformedURLException, IOException {
        String urlStr = "https://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
        URL url = new URL(urlStr);
        // Jsoup.parse(URL, timeout): the first argument is the URL, the second is the timeout in milliseconds
        Document document = Jsoup.parse(url, 4000);
        // System.out.println(document);
        return document;
    }
}
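One caveat about chains like select(...).get(0), used above and again in TestMain: if the selector matches nothing, get(0) throws an IndexOutOfBoundsException. Recent jsoup versions offer selectFirst(), which returns null instead; a small defensive sketch (extractHref is a made-up name):

// Returns the href of the first element matching cssQuery, or null if nothing matches
private static String extractHref(Document document, String cssQuery) {
    Element first = document.selectFirst(cssQuery);
    return first == null ? null : first.attr("href");
}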
4.TestMain.java
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class TestMain {
    public static void main(String[] args) throws Exception {
        // Not used below
        String usrEnd = "https://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,145.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
        // URL of the first results page; the crawl starts here
        String url = "https://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
        Document document = getDocumentByUrl(url);
        Page page = new Page();
        page.setDocument(document);
        int sum = 0;
        BufferedWriter bw = new BufferedWriter(new FileWriter("D:/大数据.txt"));
        while (true) {
            sum++;
            List<JobBean> jobsByPage = getJobsByPage(page);
            getNextPageUrl(page);
            for (JobBean jobBean : jobsByPage) {
                // System.out.println(jobBean);
                if (jobBean.toString() != null) {
                    bw.write(jobBean.toString());
                    bw.newLine();
                    bw.flush();
                } else {
                    continue;
                }
            }
            System.out.println("——————————————" + sum + "——————————————");
            if (page.isHasNextPage()) {
                document = getDocumentByUrl(page.getNextPageUrl());
                page.setDocument(document);
            } else {
                break;
            }
            // Thread.sleep(1000);
        }
        bw.close();
    }

    // Find the "next page" link in the pager and record it on the Page object
    public static void getNextPageUrl(Page page) {
        Document document = page.getDocument();
        Elements select = document.select(".bk");
        // The second ".bk" block is expected to contain the next-page link
        Elements select2 = select.get(1).select("a");
        if (select2 != null && select2.size() > 0) {
            Element element = select2.get(0);
            String url = element.attr("href");
            page.setNextPageUrl(url);
            page.setHasNextPage(true);
        } else {
            page.setHasNextPage(false);
        }
    }

    private static Document getDocumentByUrl(String url) {
        URL u;
        try {
            u = new URL(url);
            Document document = Jsoup.parse(u, 4000);
            return document;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    // Parse every job row on the current page into JobBean objects
    private static List<JobBean> getJobsByPage(Page page) {
        List<JobBean> list = new ArrayList<>();
        Document document = page.getDocument();
        Elements select = document.select("#resultList .el");
        // Drop the first ".el" element: it is the header row of the result list, not a job posting
        select.remove(0);
        for (Element element : select) {
            String jobName = element.select(".t1 a").get(0).text();
            String comName = element.select(".t2 a").get(0).attr("title");
            String addr = element.select(".t3").get(0).text();
            String salary = element.select(".t4").get(0).text();
            String date = element.select(".t5").get(0).text();
            JobBean jobBean = new JobBean();
            jobBean.set(jobName, comName, addr, salary, date);
            list.add(jobBean);
        }
        return list;
    }
}
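Two hardening points for the loop in main, shown as a sketch rather than a drop-in replacement: getDocumentByUrl() returns null on failure, which would cause a NullPointerException on the next iteration, and the BufferedWriter is easier to manage with try-with-resources; a short pause between page requests is also gentler on the server. The crawl method name and its parameters are my own:

// Sketch of a more defensive crawl loop; relies on the helpers already defined in TestMain
public static void crawl(String startUrl, String outFile) throws Exception {
    try (BufferedWriter bw = new BufferedWriter(new FileWriter(outFile))) {
        Page page = new Page();
        page.setDocument(getDocumentByUrl(startUrl));
        while (page.getDocument() != null) {
            for (JobBean job : getJobsByPage(page)) {
                bw.write(job.toString());
                bw.newLine();
            }
            getNextPageUrl(page);
            if (!page.isHasNextPage()) {
                break;
            }
            Thread.sleep(1000); // be polite: pause between page requests
            page.setDocument(getDocumentByUrl(page.getNextPageUrl()));
        }
    }
}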