用Jsoup写一个简单的爬虫,并把从网页上面爬下的数据保存到数据库中

今天研究了Jsoup的一些基本用法,在这里和大家分享一下。我的做法是:先把从网页上抓取到的数据封装成对象,再把对象存进HashMap中,最后通过JDBC保存到数据库中。

今天要爬取的网站是智联招聘。每一条招聘信息都可以看成是一个对象,因此需要一个自定义的JavaBean类。

其中要用到两个jar包,分别是 jsoup-1.10.3.jar 和 mysql-connector-java-5.1.39.jar。

package javabeen;

import java.util.Date;

/**
 * Plain data holder for a single job posting.
 * <p>
 * Every attribute is kept as the raw text taken from the page; nothing is
 * parsed or validated here.
 *
 * @author xml
 */
public class Job {
	/** Job title. */
	private String position;
	/** Name of the hiring company. */
	private String company;
	/** Salary text. */
	private String compensation;
	/** Work location. */
	private String workplace;
	/** Publication date text. */
	private String date;
	/** Required education. */
	private String education;
	/** Required work experience. */
	private String experience;
	/** Job category. */
	private String type;
	/** Number of people wanted. */
	private String number;
	/** Description of the job. */
	private String jobdescription;
	/** Description of the company. */
	private String comdescription;

	/**
	 * Creates a job with every attribute supplied up front.
	 *
	 * @param position       job title
	 * @param company        company name
	 * @param compensation   salary text
	 * @param workplace      work location
	 * @param date           publication date text
	 * @param education      required education
	 * @param experience     required work experience
	 * @param type           job category
	 * @param number         number of people wanted
	 * @param jobdescription description of the job
	 * @param comdescription description of the company
	 */
	public Job(String position, String company, String compensation, String workplace, String date, String education,
			String experience, String type, String number, String jobdescription, String comdescription) {
		this.position = position;
		this.company = company;
		this.compensation = compensation;
		this.workplace = workplace;
		this.date = date;
		this.education = education;
		this.experience = experience;
		this.type = type;
		this.number = number;
		this.jobdescription = jobdescription;
		this.comdescription = comdescription;
	}

	/** Creates an empty job; every attribute starts out as {@code null}. */
	public Job() {
	}

	/** @return the job title */
	public String getPosition() {
		return position;
	}

	/** @param position the job title to store */
	public void setPosition(String position) {
		this.position = position;
	}

	/** @return the company name */
	public String getCompany() {
		return company;
	}

	/** @param company the company name to store */
	public void setCompany(String company) {
		this.company = company;
	}

	/** @return the salary text */
	public String getCompensation() {
		return compensation;
	}

	/** @param compensation the salary text to store */
	public void setCompensation(String compensation) {
		this.compensation = compensation;
	}

	/** @return the work location */
	public String getWorkplace() {
		return workplace;
	}

	/** @param workplace the work location to store */
	public void setWorkplace(String workplace) {
		this.workplace = workplace;
	}

	/** @return the publication date text */
	public String getDate() {
		return date;
	}

	/** @param date the publication date text to store */
	public void setDate(String date) {
		this.date = date;
	}

	/** @return the required education */
	public String getEducation() {
		return education;
	}

	/** @param education the required education to store */
	public void setEducation(String education) {
		this.education = education;
	}

	/** @return the required work experience */
	public String getExperience() {
		return experience;
	}

	/** @param experience the required work experience to store */
	public void setExperience(String experience) {
		this.experience = experience;
	}

	/** @return the job category */
	public String getType() {
		return type;
	}

	/** @param type the job category to store */
	public void setType(String type) {
		this.type = type;
	}

	/** @return the number of people wanted */
	public String getNumber() {
		return number;
	}

	/** @param number the number of people wanted to store */
	public void setNumber(String number) {
		this.number = number;
	}

	/** @return the description of the job */
	public String getJobdescription() {
		return jobdescription;
	}

	/** @param jobdescription the description of the job to store */
	public void setJobdescription(String jobdescription) {
		this.jobdescription = jobdescription;
	}

	/** @return the description of the company */
	public String getComdescription() {
		return comdescription;
	}

	/** @param comdescription the description of the company to store */
	public void setComdescription(String comdescription) {
		this.comdescription = comdescription;
	}

	/**
	 * Renders all attributes as {@code Job [name=value, ...]} in declaration
	 * order; unset attributes appear as {@code null}.
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("Job [");
		text.append("position=").append(position);
		text.append(", company=").append(company);
		text.append(", compensation=").append(compensation);
		text.append(", workplace=").append(workplace);
		text.append(", date=").append(date);
		text.append(", education=").append(education);
		text.append(", experience=").append(experience);
		text.append(", type=").append(type);
		text.append(", number=").append(number);
		text.append(", jobdescription=").append(jobdescription);
		text.append(", comdescription=").append(comdescription);
		return text.append("]").toString();
	}

}
然后我们编写爬虫,进行爬取操作:

package control;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.ListIterator;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


import javabeen.Job;
import utils.datautils;


/**
 * Minimal crawler: downloads one search-result page, turns every result row
 * into a {@link Job}, stores the jobs in the {@code job} table and prints them.
 */
public class Spider {

	/**
	 * Search-result page to crawl. The query string asks for keyword "java" in
	 * Zhengzhou (the URL-encoded {@code jl} value); {@code p=3} is presumably
	 * the page number.
	 */
	static String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%83%91%E5%B7%9E&kw=java&sm=0&p=3";

	/** Number of job rows parsed so far; each job is stored in the map under this counter. */
	static int i = 0;

	/** Parameterized insert for the five columns that are filled from the result list. */
	private static final String INSERT_SQL = "insert into job(position,company,compensation,workplace,date)values(?,?,?,?,?)";

	/**
	 * Runs the crawl: fetch the result page, parse the job rows, write them to
	 * the database and print each one as {@code key,job}.
	 * <p>
	 * Database failures are reported on the console and do not stop the
	 * method; the parsed jobs are printed in any case.
	 *
	 * @throws IOException if the result page cannot be downloaded
	 */
	public static void body1() throws IOException {
		Document doc = Jsoup.connect(url).get();

		HashMap<Integer, Job> hMap = parseJobs(doc);
		saveJobs(hMap);

		Set<Integer> keys = hMap.keySet();
		for (Integer key : keys) {
			System.out.println(key + "," + hMap.get(key).toString());
		}
	}

	/**
	 * Extracts one {@link Job} per result table from the downloaded page.
	 * <p>
	 * The detail page behind the position link is intentionally not fetched:
	 * none of the stored columns come from it, and an extra request per row
	 * would let a single failing page abort the whole crawl.
	 *
	 * @param doc the parsed search-result page
	 * @return the jobs keyed by the running counter {@link #i}; empty when the
	 *         result container is not present in the page
	 */
	private static HashMap<Integer, Job> parseJobs(Document doc) {
		HashMap<Integer, Job> hMap = new HashMap<Integer, Job>();

		Element element = doc.select("div#newlist_list_content_table").first();
		if (element == null) {
			// Container missing (layout changed, redirect, ...): nothing to parse.
			return hMap;
		}

		Elements tables = element.select("table");
		// Start at index 1 to skip the first table (presumably the header row).
		// Math.min keeps the start index valid when no table was found at all.
		ListIterator<Element> listIter = tables.listIterator(Math.min(1, tables.size()));
		while (listIter.hasNext()) {
			Element table = listIter.next();

			Element link = table.select("tr>td.zwmc").select("a").first();
			if (link == null) {
				// Without a position link this table is not a job row.
				continue;
			}

			Job job = new Job();
			job.setPosition(link.text());
			job.setCompany(textOf(table.select("tr>td.gsmc").select("a").first()));
			job.setCompensation(textOf(table.select("tr>td.zwyx").first()));
			job.setWorkplace(textOf(table.select("tr>td.gzdd").first()));
			job.setDate(textOf(table.select("tr>td.gxsj").select("span").first()));

			i++;
			hMap.put(i, job);
		}
		return hMap;
	}

	/**
	 * Returns the text of an element, or an empty string when the selector
	 * matched nothing.
	 */
	private static String textOf(Element element) {
		return element == null ? "" : element.text();
	}

	/**
	 * Inserts every job into the {@code job} table using one connection and
	 * one prepared statement, both of which are closed when the method ends.
	 * <p>
	 * A failing row is reported and skipped so the remaining rows are still
	 * attempted; a failure to open the connection is reported once.
	 *
	 * @param hMap the jobs to store
	 */
	private static void saveJobs(HashMap<Integer, Job> hMap) {
		if (hMap.isEmpty()) {
			return;
		}
		try (Connection conn = datautils.getConnection();
				PreparedStatement ps = conn.prepareStatement(INSERT_SQL)) {
			for (Job value : hMap.values()) {
				try {
					ps.setString(1, value.getPosition());
					ps.setString(2, value.getCompany());
					ps.setString(3, value.getCompensation());
					ps.setString(4, value.getWorkplace());
					ps.setString(5, value.getDate());
					ps.executeUpdate();
				} catch (SQLException e) {
					e.printStackTrace();
					System.out.println("数据库访问失败");
				}
			}
		} catch (SQLException e) {
			e.printStackTrace();
			System.out.println("数据库访问失败");
		}
	}

	/**
	 * Entry point; runs a single crawl of {@link #url}.
	 *
	 * @param args ignored
	 * @throws IOException if the result page cannot be downloaded
	 * @throws ClassNotFoundException declared for compatibility; not thrown by this class itself
	 * @throws SQLException declared for compatibility; database errors are handled inside {@link #body1()}
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException {
		body1();
	}
}

要访问数据库,所以还需要一个数据库工具类:

package utils;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;


public class datautils {
	/** Class name of the JDBC driver that the static initializer tries to load. */
	public static final String Driver = "com.mysql.jdbc.Driver";
	/** JDBC URL handed to {@link DriverManager}. */
	public static final String url = "jdbc:mysql://localhost:3306/nyb?useSSL=true";
	/** Database user name. */
	public static final String user = "root";
	/** Database password. */
	public static final String password = "123456";

	// Load the driver class once when this class is initialized. A missing
	// driver is only reported on stderr; class initialization still completes.
	static {
		try {
			Class.forName(Driver);
		} catch (ClassNotFoundException missingDriver) {
			missingDriver.printStackTrace();
		}
	}

	/**
	 * Opens a new connection with the constants declared above. The caller
	 * owns the returned connection; nothing in this class closes it.
	 *
	 * @return a freshly opened connection
	 * @throws SQLException if {@link DriverManager} cannot open the connection
	 */
	public static Connection getConnection() throws SQLException {
		return DriverManager.getConnection(url, user, password);
	}
}



你可能感兴趣的:(java基础)