Java爬虫(02)——抓取finalUrl页面信息

Java爬虫(02)——抓取finalUrl页面信息_第1张图片
抓取页面信息

实现:下面依次给出三个类的源码——GetLink(抓取并解析职位详情页)、SubLink(对抓取到的字段文本进行裁剪和数值转换)以及 Link(保存解析结果的数据 Bean)。

package com.sichan.one;

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import com.sichan.bean.Link;
import com.sichan.util.DBUtil;
import com.sichan.util.JsoupUtil;

/**
 * Fetches a single job-detail page and copies its fields into a {@link Link}.
 *
 * <p>The same {@code Link} instance is reused for every call to
 * {@link #getLink(String)}; each call overwrites its fields and prints it.
 */
public class GetLink {

    private JsoupUtil ju = JsoupUtil.getInstance();
    private DBUtil du = DBUtil.getInstance();
    private Link link = new Link();
    private String insertSql = "";

    /**
     * Downloads {@code url}, extracts the job fields from its {@code body},
     * stores them in the shared {@code link} bean and prints the bean to
     * standard output.
     *
     * <p>An {@link IOException} raised while fetching the page is reported via
     * {@code printStackTrace()} and otherwise ignored.
     *
     * @param url address of the job-detail page to fetch
     */
    public void getLink(String url) {
        final Document page;
        try {
            page = Jsoup.connect(url).get();
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        Elements body = page.select("body");
        SubLink trimmer = new SubLink();

        // Header fields are used verbatim.
        link.setNum(GetHtml.num);
        link.setPost(textOf(body, ".top-fixed-box h1"));
        link.setCompany_name(textOf(body, ".top-fixed-box h2"));

        // Every remaining field is a labelled <li>; SubLink trims/converts the raw text.
        trimmer.subSalary(textOf(body, "li:matches(职位月薪)"));
        link.setLow_salary(trimmer.low_salary);
        link.setHigh_salary(trimmer.high_salary);

        trimmer.subAddress(textOf(body, "li:matches(工作地点)"));
        link.setAddress(trimmer.subAddress);

        trimmer.subRelease_time(textOf(body, "li:matches(发布日期)"));
        link.setRelease_time(trimmer.subRelease_time);

        trimmer.subJob_nature(textOf(body, "li:matches(工作性质)"));
        link.setJob_nature(trimmer.subJob_nature);

        trimmer.subExperience(textOf(body, "li:matches(工作经验)"));
        link.setExperience(trimmer.subExperience);

        trimmer.subEdu_background(textOf(body, "li:matches(最低学历)"));
        link.setEdu_background(trimmer.subEdu_background);

        trimmer.subRecruiting_num(textOf(body, "li:matches(招聘人数)"));
        link.setRecruiting_num(trimmer.subRecruiting_num);

        trimmer.subJob_category(textOf(body, "li:matches(职位类别)"));
        link.setJob_category(trimmer.subJob_category);

        trimmer.subCompany_scale(textOf(body, "li:matches(公司规模)"));
        link.setCompany_scale(trimmer.subCompany_scale);

        trimmer.subCompany_nature(textOf(body, "li:matches(公司性质)"));
        link.setCompany_nature(trimmer.subCompany_nature);

        trimmer.subCompany_industry(textOf(body, "li:matches(公司行业)"));
        link.setCompany_industry(trimmer.subCompany_industry);

        System.out.println(link);
        // Persisting the bean (ju.getInsertSql(link) followed by du.insert(...)) is
        // currently switched off; the result is only printed.
    }

    /** Returns the combined text of all elements under {@code scope} matching {@code cssQuery}. */
    private static String textOf(Elements scope, String cssQuery) {
        return scope.select(cssQuery).text();
    }
}
package com.sichan.one;

/**
 * Trims the raw field texts scraped from a job-detail page and converts the
 * numeric ones.
 *
 * <p>Every method skips the first five characters of its argument (presumably
 * the field label plus its colon, e.g. "职位月薪:" — confirm against the page)
 * and stores the remainder in the matching package-private field.
 *
 * <p>No method throws on malformed input: text that is {@code null}, too short
 * or not numeric yields an empty string for text fields and {@code -1} for
 * numeric fields.
 */
public class SubLink {

    /** Number of leading characters every parser skips. */
    private static final int LABEL_LENGTH = 5;

    /** Value stored in numeric fields when nothing could be parsed. */
    private static final int UNKNOWN = -1;

    double low_salary;
    double high_salary;
    String subAddress;
    String subRelease_time;
    String subJob_nature;
    String subExperience;
    String subEdu_background;
    Integer subRecruiting_num;
    String subJob_category;
    String subCompany_scale;
    String subCompany_nature;
    String subCompany_industry;

    /**
     * Converts the scraped monthly-salary text into {@code low_salary} and
     * {@code high_salary}.
     *
     * <ul>
     *   <li>Text containing "以": the value between the label and the last six
     *       characters is stored in both fields (presumably an open-ended
     *       range such as "…以上"/"…以下" — confirm).</li>
     *   <li>Otherwise, text containing "元": the value between the label and
     *       the last four characters is split on "-" into low and high. An odd
     *       low bound is decremented by one (6001 becomes 6000).</li>
     *   <li>Anything else, or anything that cannot be parsed: both fields are
     *       set to {@code -1}.</li>
     * </ul>
     *
     * @param salary raw salary text, label included
     */
    public void subSalary(String salary) {
        double low = UNKNOWN;
        double high = UNKNOWN;
        if (salary != null) {
            try {
                if (salary.contains("以")) {
                    double bound = Double.parseDouble(slice(salary, LABEL_LENGTH, 6));
                    low = bound;
                    high = bound;
                } else if (salary.contains("元")) {
                    String[] bounds = slice(salary, LABEL_LENGTH, 4).split("-");
                    if (bounds.length >= 2) {
                        // Parse both bounds before assigning so a bad high bound
                        // cannot leave a half-updated result.
                        double parsedLow = Double.parseDouble(bounds[0]);
                        double parsedHigh = Double.parseDouble(bounds[1]);
                        low = parsedLow % 2 == 0 ? parsedLow : parsedLow - 1;
                        high = parsedHigh;
                    }
                }
            } catch (NumberFormatException ignored) {
                // Not a number: low/high still hold UNKNOWN.
            }
        }
        low_salary = low;
        high_salary = high;
    }

    /**
     * Stores the two characters that follow the label (fewer if the text is
     * shorter) in {@code subAddress}.
     *
     * @param address raw work-location text, label included
     */
    public void subAddress(String address) {
        if (address == null || address.length() < LABEL_LENGTH) {
            subAddress = "";
            return;
        }
        int end = Math.min(LABEL_LENGTH + 2, address.length());
        subAddress = address.substring(LABEL_LENGTH, end);
    }

    /**
     * Stores the text after the label in {@code subRelease_time}.
     *
     * @param release_time raw publish-date text, label included
     */
    public void subRelease_time(String release_time) {
        subRelease_time = stripLabel(release_time);
    }

    /**
     * Stores the text after the label in {@code subJob_nature}.
     *
     * @param job_nature raw job-nature text, label included
     */
    public void subJob_nature(String job_nature) {
        subJob_nature = stripLabel(job_nature);
    }

    /**
     * Stores the text after the label in {@code subExperience}.
     *
     * @param experience raw work-experience text, label included
     */
    public void subExperience(String experience) {
        subExperience = stripLabel(experience);
    }

    /**
     * Stores the text after the label in {@code subEdu_background}.
     *
     * @param edu_background raw minimum-education text, label included
     */
    public void subEdu_background(String edu_background) {
        subEdu_background = stripLabel(edu_background);
    }

    /**
     * Converts the scraped head-count text into {@code subRecruiting_num}.
     *
     * <p>When the text after the label contains "人", its last character is
     * dropped and the rest parsed as an integer. Text without "人", or whose
     * remainder is not an integer, yields {@code -1}.
     *
     * @param recruiting_num raw head-count text, label included
     */
    public void subRecruiting_num(String recruiting_num) {
        String count = stripLabel(recruiting_num);
        int parsed = UNKNOWN;
        if (count.contains("人")) {
            try {
                parsed = Integer.parseInt(count.substring(0, count.length() - 1));
            } catch (NumberFormatException ignored) {
                // Non-numeric count: keep UNKNOWN.
            }
        }
        subRecruiting_num = parsed;
    }

    /**
     * Stores the job category in {@code subJob_category}. When the text
     * contains "您也许对", the last fifteen characters are dropped as well as
     * the label.
     *
     * @param job_category raw job-category text, label included
     */
    public void subJob_category(String job_category) {
        if (job_category != null && job_category.contains("您也许对")) {
            subJob_category = slice(job_category, LABEL_LENGTH, 15);
        } else {
            subJob_category = stripLabel(job_category);
        }
    }

    /**
     * Stores the text after the label in {@code subCompany_scale}.
     *
     * @param company_scale raw company-size text, label included
     */
    public void subCompany_scale(String company_scale) {
        subCompany_scale = stripLabel(company_scale);
    }

    /**
     * Stores the text after the label in {@code subCompany_nature}.
     *
     * @param company_nature raw company-nature text, label included
     */
    public void subCompany_nature(String company_nature) {
        subCompany_nature = stripLabel(company_nature);
    }

    /**
     * Stores the text after the label in {@code subCompany_industry}.
     *
     * @param company_industry raw company-industry text, label included
     */
    public void subCompany_industry(String company_industry) {
        subCompany_industry = stripLabel(company_industry);
    }

    /** Returns {@code text} without its first {@link #LABEL_LENGTH} characters, or "" if it is shorter. */
    private static String stripLabel(String text) {
        return slice(text, LABEL_LENGTH, 0);
    }

    /**
     * Returns {@code text} with {@code skip} leading and {@code dropTail}
     * trailing characters removed, or "" when {@code text} is {@code null} or
     * too short for that.
     */
    private static String slice(String text, int skip, int dropTail) {
        if (text == null) {
            return "";
        }
        int end = text.length() - dropTail;
        return end < skip ? "" : text.substring(skip, end);
    }
}
package com.sichan.bean;

import java.io.Serializable;

/**
 * Serializable bean holding the fields extracted from one job-detail page.
 *
 * <p>All properties are plain read/write values; no validation is performed.
 */
public class Link implements Serializable {

    private static final long serialVersionUID = 1165098694307553167L;

    private int num;
    private String post;
    private String company_name;
    private double low_salary;
    private double high_salary;
    private String address;
    private String release_time;
    private String job_nature;
    private String experience;
    private String edu_background;
    private int recruiting_num;
    private String job_category;
    private String company_scale;
    private String company_nature;
    private String company_industry;

    // ---- accessors, in field-declaration order ----

    /** @return the record number */
    public int getNum() { return this.num; }

    /** @param num the record number */
    public void setNum(int num) { this.num = num; }

    /** @return the job title */
    public String getPost() { return this.post; }

    /** @param post the job title */
    public void setPost(String post) { this.post = post; }

    /** @return the company name */
    public String getCompany_name() { return this.company_name; }

    /** @param company_name the company name */
    public void setCompany_name(String company_name) { this.company_name = company_name; }

    /** @return the lower salary bound */
    public double getLow_salary() { return this.low_salary; }

    /** @param low_salary the lower salary bound */
    public void setLow_salary(double low_salary) { this.low_salary = low_salary; }

    /** @return the upper salary bound */
    public double getHigh_salary() { return this.high_salary; }

    /** @param high_salary the upper salary bound */
    public void setHigh_salary(double high_salary) { this.high_salary = high_salary; }

    /** @return the work location */
    public String getAddress() { return this.address; }

    /** @param address the work location */
    public void setAddress(String address) { this.address = address; }

    /** @return the publish date text */
    public String getRelease_time() { return this.release_time; }

    /** @param release_time the publish date text */
    public void setRelease_time(String release_time) { this.release_time = release_time; }

    /** @return the job nature */
    public String getJob_nature() { return this.job_nature; }

    /** @param job_nature the job nature */
    public void setJob_nature(String job_nature) { this.job_nature = job_nature; }

    /** @return the required experience */
    public String getExperience() { return this.experience; }

    /** @param experience the required experience */
    public void setExperience(String experience) { this.experience = experience; }

    /** @return the minimum education */
    public String getEdu_background() { return this.edu_background; }

    /** @param edu_background the minimum education */
    public void setEdu_background(String edu_background) { this.edu_background = edu_background; }

    /** @return the number of openings */
    public int getRecruiting_num() { return this.recruiting_num; }

    /** @param recruiting_num the number of openings */
    public void setRecruiting_num(int recruiting_num) { this.recruiting_num = recruiting_num; }

    /** @return the job category */
    public String getJob_category() { return this.job_category; }

    /** @param job_category the job category */
    public void setJob_category(String job_category) { this.job_category = job_category; }

    /** @return the company size */
    public String getCompany_scale() { return this.company_scale; }

    /** @param company_scale the company size */
    public void setCompany_scale(String company_scale) { this.company_scale = company_scale; }

    /** @return the company nature */
    public String getCompany_nature() { return this.company_nature; }

    /** @param company_nature the company nature */
    public void setCompany_nature(String company_nature) { this.company_nature = company_nature; }

    /** @return the company industry */
    public String getCompany_industry() { return this.company_industry; }

    /** @param company_industry the company industry */
    public void setCompany_industry(String company_industry) { this.company_industry = company_industry; }

    /**
     * Renders every property as {@code name=value}, comma-separated, inside
     * {@code Link [...]}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Link [");
        text.append("num=").append(num);
        text.append(", post=").append(post);
        text.append(", company_name=").append(company_name);
        text.append(", low_salary=").append(low_salary);
        text.append(", high_salary=").append(high_salary);
        text.append(", address=").append(address);
        text.append(", release_time=").append(release_time);
        text.append(", job_nature=").append(job_nature);
        text.append(", experience=").append(experience);
        text.append(", edu_background=").append(edu_background);
        text.append(", recruiting_num=").append(recruiting_num);
        text.append(", job_category=").append(job_category);
        text.append(", company_scale=").append(company_scale);
        text.append(", company_nature=").append(company_nature);
        text.append(", company_industry=").append(company_industry);
        return text.append("]").toString();
    }
}

输出结果:


Java爬虫(02)——抓取finalUrl页面信息_第2张图片
抓取的信息

你可能感兴趣的:(Java爬虫(02)——抓取finalUrl页面信息)