Java爬取广州大学计算机学院师资队伍信息

欢迎各位高三的同学报考厂大

使用 Java 的 HTML 解析库 Jsoup 爬取广州大学计算机科学与网络工程学院师资队伍页面中的公开信息，并将结果存储到 CSV 文件中

案例分析

Java爬取广州大学计算机学院师资队伍信息_第1张图片

测试结果

测试截图1
测试截图2

爬取信息工具HTMLParseUtil类

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

/**
 * Command-line scraper for the faculty listing of the site at {@link #BASE_URL}.
 *
 * <p>It downloads the listing page, follows each teacher link to the teacher's detail
 * page, extracts the avatar URL and the labelled profile paragraphs, and writes one CSV
 * row per teacher to {@code TeacherInfo.csv} in the working directory.
 */
public class HTMLParseUtil {

    /** Site root; also used to turn root-relative image paths into absolute URLs. */
    private static final String BASE_URL = "http://jsj.gzhu.edu.cn";

    /** Faculty listing page that links to every teacher detail page. */
    private static final String LIST_URL = BASE_URL + "/szdw1/jsjkxywlgcxysz.htm";

    /** Timeout, in milliseconds, passed to jsoup for every page download. */
    private static final int TIMEOUT_MILLIS = 30000;

    /** Output file, relative to the working directory. */
    private static final String CSV_FILE_NAME = "TeacherInfo.csv";

    /**
     * Runs the scrape and writes the CSV file.
     *
     * <p>Any failure (network, parsing, file I/O) aborts the run and prints the stack
     * trace; no exception escapes this method.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            Map<String, String> detailUrls = new HashMap<>();
            Map<String, TeacherInfo> teachers = new HashMap<>();
            collectTeacherLinks(detailUrls, teachers);

            System.out.println("请求解析网页获得对应老师的信息");
            for (Map.Entry<String, String> entry : detailUrls.entrySet()) {
                // Download each detail page once and extract everything from that copy.
                Document detailPage = Jsoup.parse(new URL(entry.getValue()), TIMEOUT_MILLIS);
                TeacherInfo teacher = teachers.get(entry.getKey());
                applyAvatar(detailPage, teacher);
                applyProfile(detailPage, teacher);
            }

            File file = new File(CSV_FILE_NAME);
            writeCsv(file, teachers.values());

            // Report the number of distinct entries, i.e. the number of rows written.
            System.out.println("一共有教师人数:" + teachers.size());
            System.out.println("输出已经写入到路径" + file.getAbsolutePath() + "中");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the listing page and records every teacher link found in elements of class
     * {@code mclb}.
     *
     * @param detailUrls receives teacher name to absolute detail-page URL
     * @param teachers   receives teacher name to a new {@link TeacherInfo} carrying that name
     * @throws java.io.IOException if the listing page cannot be downloaded
     */
    private static void collectTeacherLinks(Map<String, String> detailUrls,
                                            Map<String, TeacherInfo> teachers)
            throws java.io.IOException {
        Document listing = Jsoup.parse(new URL(LIST_URL), TIMEOUT_MILLIS);
        for (Element container : listing.getElementsByClass("mclb")) {
            for (Element link : container.getElementsByTag("a")) {
                String name = link.text();
                // Only short link texts are kept; presumably this filters out
                // navigation links that are not teacher names.
                if (name.length() >= 10) {
                    continue;
                }
                // Resolve against the page's base URI instead of slicing the raw href,
                // which failed on empty or non-"../" hrefs.
                String href = link.absUrl("href");
                if (href.isEmpty()) {
                    continue;
                }
                detailUrls.put(name, href);
                TeacherInfo teacher = new TeacherInfo();
                teacher.setName(name);
                teachers.put(name, teacher);
            }
        }
    }

    /**
     * Stores the avatar URL from a detail page: every {@code img} whose {@code src}
     * starts with {@code /__local/} is considered, and the last one wins.
     */
    private static void applyAvatar(Document detailPage, TeacherInfo teacher) {
        for (Element img : detailPage.getElementsByTag("img")) {
            String src = img.attr("src");
            if (src.startsWith("/__local/")) {
                teacher.setImg_url(BASE_URL + src);
            }
        }
    }

    /**
     * Copies labelled paragraphs inside the {@code vsb_content} element into the matching
     * {@link TeacherInfo} field. The whole paragraph text, label included, is stored.
     */
    private static void applyProfile(Document detailPage, TeacherInfo teacher) {
        Elements paragraphs = detailPage.select("[id=vsb_content]").select("p");
        for (Element paragraph : paragraphs) {
            String text = paragraph.text();
            if (text.startsWith("职称") || text.startsWith("职务")) {
                teacher.setTitle(text);
            } else if (text.startsWith("系、研究所") || text.startsWith("部门")) {
                teacher.setDepartment(text);
            } else if (text.startsWith("研究领域")) {
                teacher.setDomain(text);
            } else if (text.startsWith("讲授课程")) {
                teacher.setCourse(text);
            } else if (text.startsWith("电子邮箱") || text.startsWith("电子邮件")) {
                teacher.setEmail(text);
            } else if (text.startsWith("办公电话")) {
                teacher.setPhone(text);
            } else if (text.startsWith("个人主页")) {
                teacher.setHome(text);
            }
        }
    }

    /**
     * Writes one CSV row per teacher, replacing any existing file, and echoes each
     * teacher to standard output.
     *
     * <p>Column order: name, img_url, title, department, domain, course, email, home,
     * phone. Each row is preceded by a line separator, so the file starts with an empty
     * line; this layout is kept unchanged for existing consumers of the file.
     *
     * @throws java.io.IOException if the file cannot be written
     */
    private static void writeCsv(File file, Iterable<TeacherInfo> teachers)
            throws java.io.IOException {
        // FileWriter truncates an existing file, and try-with-resources guarantees the
        // writer is flushed and closed even when a write fails.
        // NOTE(review): FileWriter uses the platform default charset; confirm whether
        // consumers of the file expect that or a fixed encoding.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
            for (TeacherInfo teacher : teachers) {
                System.out.println(teacher);
                writer.newLine();
                writer.write(csvField(teacher.getName()) + ","
                        + csvField(teacher.getImg_url()) + ","
                        + csvField(teacher.getTitle()) + ","
                        + csvField(teacher.getDepartment()) + ","
                        + csvField(teacher.getDomain()) + ","
                        + csvField(teacher.getCourse()) + ","
                        + csvField(teacher.getEmail()) + ","
                        + csvField(teacher.getHome()) + ","
                        + csvField(teacher.getPhone()));
            }
        }
    }

    /**
     * Formats one value as a CSV field: {@code null} becomes an empty field, and a value
     * containing a comma, double quote or line break is wrapped in double quotes with
     * embedded quotes doubled.
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }
}

抽象出一个 TeacherInfo 类，用于存储教师个人信息

/**
 * Mutable holder for the details of one teacher.
 *
 * <p>Every field is a plain string that stays {@code null} until its setter is called.
 */
public class TeacherInfo {

    private String name;
    private String img_url;
    private String title;
    private String department;
    private String domain;
    private String course;
    private String email;
    private String home;
    private String phone;

    /** Creates an instance with every field unset ({@code null}). */
    public TeacherInfo() {
    }

    /** @return the teacher's name, or {@code null} if unset */
    public String getName() {
        return this.name;
    }

    /** @param name the teacher's name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the avatar image URL, or {@code null} if unset */
    public String getImg_url() {
        return this.img_url;
    }

    /** @param img_url the avatar image URL */
    public void setImg_url(String img_url) {
        this.img_url = img_url;
    }

    /** @return the title text, or {@code null} if unset */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title text */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the department text, or {@code null} if unset */
    public String getDepartment() {
        return this.department;
    }

    /** @param department the department text */
    public void setDepartment(String department) {
        this.department = department;
    }

    /** @return the research-domain text, or {@code null} if unset */
    public String getDomain() {
        return this.domain;
    }

    /** @param domain the research-domain text */
    public void setDomain(String domain) {
        this.domain = domain;
    }

    /** @return the course text, or {@code null} if unset */
    public String getCourse() {
        return this.course;
    }

    /** @param course the course text */
    public void setCourse(String course) {
        this.course = course;
    }

    /** @return the e-mail text, or {@code null} if unset */
    public String getEmail() {
        return this.email;
    }

    /** @param email the e-mail text */
    public void setEmail(String email) {
        this.email = email;
    }

    /** @return the home-page text, or {@code null} if unset */
    public String getHome() {
        return this.home;
    }

    /** @param home the home-page text */
    public void setHome(String home) {
        this.home = home;
    }

    /** @return the phone text, or {@code null} if unset */
    public String getPhone() {
        return this.phone;
    }

    /** @param phone the phone text */
    public void setPhone(String phone) {
        this.phone = phone;
    }

    /**
     * Renders every field as {@code label='value'}, separated by {@code ", "}, in
     * declaration order. Unset fields are rendered as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder();
        appendField(out, "name", name);
        appendField(out, "img_url", img_url);
        appendField(out, "title", title);
        appendField(out, "department", department);
        appendField(out, "domain", domain);
        appendField(out, "course", course);
        appendField(out, "email", email);
        appendField(out, "home", home);
        appendField(out, "phone", phone);
        return out.toString();
    }

    /** Appends {@code label='value'}, preceded by {@code ", "} unless it is the first field. */
    private static void appendField(StringBuilder out, String label, String value) {
        if (out.length() > 0) {
            out.append(", ");
        }
        out.append(label).append("='").append(value).append('\'');
    }
}

你可能感兴趣的:(JAVA,java,爬虫)