java爬取前程无忧(51job)

什么是Jsoup

jsoup 是一款优秀的 Java HTML 解析器,主要用来解析 HTML 并进行 DOM 操作。它的很多方法和 JS 中的方法类似,例如 getElementById、select 等,使用起来非常方便。不熟悉的朋友可以先自行学习一下,这里就不再赘述了。

爬取前程无忧准备工作

1.创建一个普通的maven工程
java爬取前程无忧(51job)_第1张图片
2.在 pom.xml文件中引入依赖

 <dependencies>
    <!--
      NOTE: the code later in this article also uses MyBatis (SqlSessionFactory, Resources)
      and a MySQL JDBC driver (com.mysql.jdbc.Driver); those artifacts are not declared
      here and must be added to the project as well.
    -->

    <!-- Apache HttpClient: performs the HTTP requests against 51job -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.9</version>
    </dependency>

    <!-- Apache HttpCore: low-level HTTP types used together with HttpClient -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpcore</artifactId>
      <version>4.4.11</version>
    </dependency>

    <!-- jsoup: HTML parser used to extract the job postings -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.12.1</version>
    </dependency>
  </dependencies>

分析51job网站

java爬取前程无忧(51job)_第2张图片
如何获取所有的城市和代码?
java爬取前程无忧(51job)_第3张图片
java爬取前程无忧(51job)_第4张图片

开始爬取51job

观察51job的字段创建对应的实体类表
java爬取前程无忧(51job)_第5张图片
创建Jobs实体类

/**
 * Plain data holder for one job posting scraped from a 51job result page.
 *
 * <p>Every value is kept as the raw text taken from the page; nothing is parsed
 * or validated here.
 */
public class Jobs {
    /** Auto-increment primary key. */
    private Integer jobId;
    /** Job title. */
    private String jobName;
    /** Company name. */
    private String companyName;
    /** Work location as shown on the page. */
    private String workAddr;
    /** Salary text as shown on the page. */
    private String salary;
    /** Publication date text as shown on the page. */
    private String pushDate;
    /** Key of the posting; JobParse fills it with the value of the posting's checkbox input. */
    private String jobKey;

    /** @return the primary key, or {@code null} if none has been assigned */
    public Integer getJobId() {
        return jobId;
    }

    /** @param jobId the primary key */
    public void setJobId(Integer jobId) {
        this.jobId = jobId;
    }

    /** @return the job title */
    public String getJobName() {
        return jobName;
    }

    /** @param jobName the job title */
    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    /** @return the company name */
    public String getCompanyName() {
        return companyName;
    }

    /** @param companyName the company name */
    public void setCompanyName(String companyName) {
        this.companyName = companyName;
    }

    /** @return the work location */
    public String getWorkAddr() {
        return workAddr;
    }

    /** @param workAddr the work location */
    public void setWorkAddr(String workAddr) {
        this.workAddr = workAddr;
    }

    /** @return the salary text */
    public String getSalary() {
        return salary;
    }

    /** @param salary the salary text */
    public void setSalary(String salary) {
        this.salary = salary;
    }

    /** @return the publication date text */
    public String getPushDate() {
        return pushDate;
    }

    /** @param pushDate the publication date text */
    public void setPushDate(String pushDate) {
        this.pushDate = pushDate;
    }

    /** @return the posting key */
    public String getJobKey() {
        return jobKey;
    }

    /** @param jobKey the posting key */
    public void setJobKey(String jobKey) {
        this.jobKey = jobKey;
    }
}

创建JobParse

public class JobParse {

    public static List<Jobs> getData(String entity){
        /**
         * 读取mybatis配置文件
         */
        String resource = "mybatis-config.xml";
        InputStream inputStream = null;
        try {
            inputStream = Resources.getResourceAsStream(resource);
        } catch (IOException e) {
            e.printStackTrace();
        }
        /**
         * 得到连接对象注册sqlsession
         */
        SqlSessionFactory sqlSessionFactory = new SqlSessionFactoryBuilder().build(inputStream);

        SqlSession sqlSession = sqlSessionFactory.openSession();

        JobsMapper jobsMapper = sqlSession.getMapper(JobsMapper.class);

         List<Jobs> data = new ArrayList<Jobs>();
         Document doc = Jsoup.parse(entity);
         Elements elements = doc.select("div.el");
         Elements title =  elements.select("p.t1").select("span").select("a"); //标题
         Elements complany = elements.select("span.t2").select("a"); //公司
         Elements address = elements.select("span.t3");//地址
         Elements salary = elements.select("span.t4");//薪水
         Elements datas = elements.select("span.t5");//发布日期
         Elements SrcId = elements.select("p.t1").select("input.checkbox");//招聘信息对应的id
       
         Jobs jobs = new Jobs();

         if (title !=null || title.equals("")) {
             for (Element element : title) {
                 jobs.setJobName(element.text());
             }
         }

         if (complany !=null || complany.equals("")) {
             for (Element element : complany) {
                 jobs.setCompanyName(element.text());
             }
         }

         if (address !=null || address.equals("")) {
             for (Element element : address) {
                 jobs.setWorkAddr(element.text());
             }
         }

         if (salary !=null || salary.equals("")) {
             for (Element element : salary) {
                 jobs.setSalary(element.text());
             }
         }

         if (datas !=null || datas.equals("")) {
             for (Element element : datas) {
                 jobs.setPushDate(element.text());
             }
         }
         if (SrcId !=null || SrcId.equals("")) {
             for (Element element : SrcId) {
                 jobs.setJobKey(element.attr("value"));
             }
         }
        jobsMapper.insert(jobs);
        sqlSession.commit();

        data.add(jobs);
        return data;
    }

创建爬取主启动程序JobMain

public class JobMain {

    public static void main(String[] args) {

        System.out.println("正在生成客户端...");
        HttpClient client = null;
        System.out.println("客户端生成完毕.");


        String[] city = {"阿坝", "阿克苏", "阿拉尔", "阿拉善盟", "阿勒泰", "阿里", "鞍山",
                "安康", "安庆", "安顺", "安阳", "巴彦淖尔", "巴音郭楞", "巴中",
                "白城", "白沙", "白山", "白银", "百色", "蚌埠", "包头",
                "保定", "保山", "保亭", "宝鸡", "北海", "北京", "本溪",
                "毕节", "滨州", "博尔塔拉", "亳州", "沧州", "昌都", "昌吉",
                "昌江", "常德", "常熟", "常州", "长春", "长沙", "长治",
                "朝阳", "潮州", "郴州", "成都", "澄迈", "承德", "池州",
                "赤峰", "崇左", "滁州", "楚雄", "重庆", "达州", "大理", "大连", "大庆", "大同", "大兴安岭", "丹东",
                "丹阳", "德宏", "德阳", "德州", "邓州", "迪庆", "定安",
                "定西", "东方", "东营", "东莞", "儋州", "鄂尔多斯", "鄂州",
                "恩施", "防城港", "佛山", "福州", "抚顺", "抚州", "阜新",
                "阜阳", "甘南", "甘孜", "赣州", "固原", "广安", "广元",
                "广州", "桂林", "贵港", "贵阳", "果洛", "哈尔滨", "哈密", "海北", "海东", "海口", "海南", "海宁",
                "海西", "邯郸", "汉中", "杭州", "菏泽", "和田", "合肥",
                "河池", "河源", "鹤壁", "鹤岗", "贺州", "黑河", "衡水",
                "衡阳", "红河州", "呼和浩特", "呼伦贝尔", "葫芦岛", "湖州", "怀化",
                "淮安", "淮北", "淮南", "黄冈", "黄南", "黄山", "黄石",
                "惠州", "鸡西", "吉安", "吉林", "济南", "济宁", "济源", "嘉兴",
                "嘉峪关", "佳木斯", "江门", "焦作", "揭阳", "金昌", "金华",
                "锦州", "晋城", "晋中", "荆门", "荆州", "景德镇", "靖江",
                "九江", "酒泉", "喀什地区", "开封", "开平", "克拉玛依", "克孜勒苏柯尔克孜",
                "昆明", "昆山", "拉萨", "莱芜", "来宾", "兰州", "廊坊", "乐山", "丽江",
                "丽水", "连云港", "凉山", "聊城", "辽阳", "辽源", "林芝",
                "临沧", "临汾", "临高", "临夏", "临沂", "陵水", "柳州",
                "六安", "六盘水", "龙岩", "陇南", "娄底", "吕梁", "洛阳",
                "泸州", "漯河", "马鞍山", "茂名", "梅州", "眉山", "绵阳",
                "牡丹江", "那曲", "南昌", "南充", "南京", "南宁", "南平",
                "南通", "南阳", "内江", "宁波", "宁德", "怒江", "攀枝花", "盘锦", "萍乡", "平顶山", "平凉", "莆田", "普洱",
                "濮阳", "七台河", "齐齐哈尔", "黔东南", "黔南", "黔西南", "潜江",
                "钦州", "秦皇岛", "青岛", "清远", "庆阳", "琼海", "琼中",
                "曲靖", "泉州", "衢州", "日喀则", "日照", "三门峡", "三明", "三沙", "三亚", "山南", "汕头", "汕尾",
                "商洛", "商丘", "上海", "上饶", "韶关", "邵阳", "绍兴",
                "深圳", "神农架", "沈阳", "十堰", "石河子", "石家庄", "石嘴山",
                "双鸭山", "朔州", "四平", "松原", "苏州", "宿迁", "宿州",
                "随州", "绥化", "遂宁", "塔城", "台州", "泰安", "泰兴",
                "泰州", "太仓", "太原", "唐山", "天津", "天门", "天水",
                "铁岭", "通化", "通辽", "铜川", "铜陵", "铜仁", "图木舒克",
                "吐鲁番", "屯昌", "万宁", "威海", "潍坊", "渭南", "温州", "文昌", "文山",
                "乌海", "乌兰察布", "乌鲁木齐", "无锡", "芜湖", "梧州", "吴忠",
                "武汉", "武威", "五家渠", "五指山", "西安", "西昌", "西宁",
                "西双版纳", "锡林郭勒盟", "厦门", "仙桃", "咸宁", "咸阳", "襄阳",
                "湘潭", "湘西", "孝感", "新乡", "新余", "忻州", "信阳",
                "兴安盟", "邢台", "雄安新区", "徐州", "许昌", "宣城", "乐东", "雅安", "烟台", "盐城", "延安", "延边", "延吉",
                "燕郊开发区", "杨凌", "扬州", "洋浦经济开发区", "阳江", "阳泉", "伊春",
                "伊犁", "宜宾", "宜昌", "宜春", "义乌", "益阳", "银川",
                "鹰潭", "营口", "永州", "榆林", "玉林", "玉树", "玉溪",
                "岳阳", "云浮", "运城", "枣庄", "湛江", "漳州", "张家港",
                "张家界", "张家口", "张掖", "昭通", "肇庆", "镇江", "郑州",
                "中山", "中卫", "舟山", "周口", "珠海", "株洲", "驻马店",
                "资阳", "淄博", "自贡", "遵义", "广东省", "江苏省", "浙江省", "四川省", "海南省", "福建省", "山东省",
                "江西省", "广西", "安徽省", "河北省", "河南省", "湖北省", "湖南省",
                "陕西省", "山西省", "黑龙江省", "辽宁省", "吉林省", "云南省", "贵州省",
                "甘肃省", "内蒙古", "宁夏", "西藏", "新疆", "青海省", "香港",
                "澳门", "台湾", "国外"};


        String[] value = {
                "092200","310600","310900","281500","311300",
                "300800","230400","201000","150400","260500","170900","280900","311800","092000","241000","101800","240900","270800","141100","150600","280400","160400","251200","101700","200400","140500","010000","231000","260700","121500","311900","151800",
                "160800","300600","311200","101900","190700","070700","070500","240200","190200","210600","231400","032000","190900","090200","101300","161000","151500","280300","141400","150900","251700","060000",
                "091700","250500","230300","220500","210400","221400","230800","072100","251600","090600","121300","172000","252000","101100","271100","100900","121000","030800","100800","280800","181000","181800","140800",
                "030600","110200","230600","131100","231500","150700","271500","092100","130800","290600","091300","091600","030200","140300","141000","260200","320800","220200","310700","320500","320300","100200","320700","081600","320400","160700","200900","080200",
                "121400","311600","150200","141200","032100","171700","221000","141500","221200","161200","190500","251000","280200","281100","230900","080900","191100","071900","151700","151100","181100","320600","151000","180400","030300",
                "220900","130900", "240300","120200","120900","171900","080700","270400","220800","031500","170500","032200","270300","080600","230700","210700","211000","180800","180700","130400","072500","130300","270500","310400","170400","032700","310300","311700","250200","070600",
                "300200","121800","141300","270200","160300","090400","250600","081000","071200","092300","121700","231100","240400","300400","251800","210500","101400","271400","120800","102100","140400","151200","260400","111000","271200","191200","211200",
                "170300","090500","171500","150500","032300","032600","091200","090300","220700","300700","130200","091100","070200","140200","110800","070900","170600","090900","080300","110900","251900",
                "091000","231300","130500","171000","271000","110600","251100","171600","221300","220600","260900","261000","260800","181500","140900","160600","120300","031900","271300","100600","101600","250300","110400","081200","300300","121200",
                "171800","110700","101500","100300","300500","030400","032400","201100","171300","020000","131200","031400","191000","080500","040000","181700","230200","180600","310800","160200","290500","221100","210900","240600","240700","070300","072000","151600","181200","220400","091500","311500",
                "080800","121100","072300","071800","071600","210200","160500","050000","181600","270600","231200","240500","280700","200500","150800","260600","311100","311400","101200","100700","120600","120500","200700","080400","100500","251400","281000","281200",
                "310200", "070400", "150300", "140700", "290300", "180200", "270700", "311000", "101000", "200200", "091900", "320200", "251500", "281400", "110300", "181400", "181300", "200300", "180500", "190400", "191500", "180900", "170700", "130600", "211100", "171200", "281300", "161100", "160100", "071100", "171100",
                "151400","102000","091800","120400","071300","200600","241100","240800","161300","201200","070800","100400","032800","210800","220300","310500","090700","180300","131000","081400","190800","290200","130700","230500","191300","200800","140600","320900","250400","190600","032900","210300","121600","031700","110500","071400","191400","160900","270900","251300","031800","071000","170200","030700",
                "290400","081100","170800","030500","190300","171400","091400","120700","090800","260300","030000","070000","080000","090000","100000","110000","120000","130000","140000","150000","160000","170000","180000","190000","200000","210000",
                "220000","230000","240000","250000","260000","270000","280000","290000","300000","310000","320000","330000","340000","350000","360000"
        };

        int pagesize = 1;
        boolean splider = true;
        for (int num = 0; num <410; num ++) {
            while (splider) {

                String url = "https://search.51job.com/list/"+ value[num] +  ",000000,0000,00,9,99," + city[num] + ",2," + pagesize++ + ".html";

                System.out.println(url);

                List<Jobs> jobsList = null;

                System.out.println("正在生成客户端...");
                client = HttpClientBuilder.create().build();
                System.out.println("客户端生成完毕.");

                //开始解析
                try {
                    System.out.println("开始响应客户端...");
                    try {
                        Thread.sleep(200);
                        jobsList = URLHandle.urlParser(client, url);

                       if (jobsList.iterator().next().getJob_name().equals("")){
                           pagesize = 1;
                          break;
                       }

                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                    System.out.println("响应完成.");
                } catch (ParseException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }


                System.out.println("开始输出结果...");

                for (Jobs job : jobsList) {
                    System.out.println(job.getJob_name());

                }
                System.out.println("整个结果输出完毕,程序结束.");
            }


        }

    }

其中value和city是从51job获取的全部城市代码和城市名(两个数组按下标一一对应)。这里使用的是while循环:当jobsList中的某一个字段为空的时候,说明后面已经没有信息了,立即结束当前循环,并且把pagesize重新初始化为1,让下一个城市从第一页开始。就这样反复轮询city和value数组中的城市名和代码来进行爬取,然后通过mybatis插入到数据库。

数据库配置

mapper

/**
 * MyBatis mapper for the {@code jobs} table; the SQL statements are defined in the mapper XML.
 */
public interface JobsMapper {

    /**
     * Loads all rows of the {@code jobs} table.
     *
     * @return the stored job postings
     */
    List<Jobs> findAll();

    /**
     * Inserts one job posting.
     *
     * @param jobs the posting to store
     */
    void insert(Jobs jobs);
}

jdbc.properties

# JDBC settings; mybatis-config.xml reads them through ${driver}, ${url}, ${username} and ${password}.
driver=com.mysql.jdbc.Driver
# The Connector/J option is documented as "serverTimezone" (lower-case "s");
# the spelling "ServerTimezone" is not guaranteed to be recognised by the driver.
url=jdbc:mysql://localhost:3306/job51?serverTimezone=GMT%2B8
# Local development credentials only - replace them outside of a test setup.
username=root
password=root

mybatis-config.xml



<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
    <!-- Supplies the ${driver}, ${url}, ${username} and ${password} placeholders used below -->
    <properties resource="jdbc.properties"/>
    <environments default="development">
        <environment id="development">
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <property name="driver" value="${driver}"/>
                <property name="url" value="${url}"/>
                <property name="username" value="${username}"/>
                <property name="password" value="${password}"/>
            </dataSource>
        </environment>
    </environments>
    <mappers>
        <!-- SQL statements for JobsMapper -->
        <mapper resource="JobsMapper.xml"/>
    </mappers>
</configuration>

mapper.xml


<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
        PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.com.scitc.mapper.JobsMapper">
    <!-- Maps the columns of the jobs table onto the properties of Jobs -->
    <resultMap id="JobsMapperMap" type="cn.com.scitc.model.Jobs">
        <!-- Only the primary key is an <id>; every other column is a plain <result> -->
        <id column="job_id" property="jobId" jdbcType="INTEGER"/>
        <result column="job_name" property="jobName" jdbcType="VARCHAR"/>
        <result column="company_name" property="companyName" jdbcType="VARCHAR"/>
        <result column="work_addr" property="workAddr" jdbcType="VARCHAR"/>
        <result column="salary" property="salary" jdbcType="VARCHAR"/>
        <result column="push_date" property="pushDate" jdbcType="VARCHAR"/>
        <result column="job_key" property="jobKey" jdbcType="VARCHAR"/>

    </resultMap>

    <!-- keyProperty names the Jobs property that receives the generated key; keyColumn names the table column -->
    <insert id="insert" keyProperty="jobId" keyColumn="job_id" useGeneratedKeys="true"  parameterType="cn.com.scitc.model.Jobs">
      insert into jobs (job_name,company_name,work_addr,salary,push_date,job_key) values (#{jobName},#{companyName},#{workAddr},#{salary},#{pushDate},#{jobKey} )
    </insert>


    <select id="findAll" resultMap="JobsMapperMap">
        SELECT * FROM jobs
    </select>
</mapper>

java爬取前程无忧(51job)_第6张图片

项目地址

github

总结

爬取51job很简单,这里有个小坑:该网站的编码是gbk。你没有听错,就是gbk,把内容转换成utf-8也不行,还是会乱码,所以这里必须使用gbk编码。

你可能感兴趣的:(爬虫,java,jsoup,爬虫,前程无忧,spider)