java+selenium爬取知网数据

使用selenium工具爬取知网相关数据,思路:根据几个关键词搜索出相关的内容,然后爬取列表中所有论文的访问链接。
注意:直接爬取到的链接不能直接使用,需要自己重新拼接(把链接中的 kns/ 替换为 KCMS/),具体见代码。本人是新手,代码写得有点乱,里面还穿插了一些对关键词的简单分析。不喜勿喷,谢谢。

直接上代码

package com.test.demo.controller;

import org.apache.xmlbeans.impl.xb.xsdschema.All;
import org.checkerframework.checker.nullness.compatqual.NullableDecl;
import org.json.JSONObject;
import org.openqa.selenium.By;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.stereotype.Controller;
import org.springframework.util.ResourceUtils;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;

import java.io.*;
import java.util.*;
import java.util.concurrent.TimeUnit;


/**
 * 整体流程:       1:爬取所有论文的详情链接,存进文本文件中
 *                  2:根据文本中的详情链接,分别采集相应的数据
 *                  3:对采集到的数据再做分析,将结果写进keyWord.json文件中
 */
@Controller
@RequestMapping("/CNKISpidr")
public class CNKISpiderController {
    /**
     * Runs the complete CNKI crawl for one search request.
     *
     * <p>The three stages run one after another in the calling thread:
     * <ol>
     *   <li>{@code initqueryAndGetData} performs the search and stores the detail-page links;</li>
     *   <li>{@code spiderByUrl} visits every stored link and stores the scraped text;</li>
     *   <li>{@code writeJson} analyses the keywords and writes the graph JSON.</li>
     * </ol>
     *
     * @param themeKey    text entered into the first search field
     * @param Key         text entered into the keyword search field
     * @param abstractKey text entered into the abstract search field
     * @param pageNum     accepted for interface compatibility; this method does not read it
     * @return always {@code 1} once all three stages have completed
     * @throws Exception if any stage fails
     */
    @RequestMapping(value = "/spider", method = RequestMethod.POST)
    @ResponseBody
    public Integer spiderCNKI(String themeKey,String Key,String abstractKey,String pageNum) throws Exception{
        // Stage 1: collect the detail-page links for the search.
        initqueryAndGetData(themeKey, Key, abstractKey);
        // Stage 2: scrape each detail page.
        spiderByUrl();
        // Stage 3: analyse the scraped keywords and write the JSON result.
        writeJson();
        return 1;
    }

    // 将结果写进json文件中
    private void writeJson() throws Exception{

        // 创建存放数据的文本文件,以及读写数据的buffer
        File path1 = new File(ResourceUtils.getURL("classpath:").getPath());
        if (!path1.exists()) path1 = new File("");
        // System.out.println("path:"+path1.getAbsolutePath());
        File upload = new File(path1.getAbsolutePath(), "src/main/webapp/data");
        if (!upload.exists()) upload.mkdirs();
        //  1:创建一个文本文件,用来存放爬取的数据
        String path = upload.getAbsolutePath() + "\\keyWord.json";
        File file = new File(path);
        if (!file.exists()) {
            file.getParentFile().mkdirs();
        }
        file.createNewFile();
        // 写所有信息的  true是追加,false是覆盖
        FileWriter fileWriter = new FileWriter(file, false);
        BufferedWriter bufferedWriter = new BufferedWriter(fileWriter);

        //*********读取关键词的文本文件**********************************************************************************
        File path2 = new File(ResourceUtils.getURL("classpath:").getPath());
        if (!path2.exists()) path2 = new File("");
        // System.out.println("path:"+path1.getAbsolutePath());
        File upload2 = new File(path2.getAbsolutePath(), "src/main/webapp/data");
        if (!upload2.exists()) upload2.mkdirs();
        String pathKey = upload2.getAbsolutePath() + "\\text.txt";   // 因为是要分别获取每一篇的关键词,所以直接分析这个文本
        // 路径文件
        File filekey = new File(pathKey);

        BufferedReader in = new BufferedReader(new FileReader(filekey));
        String line = null;
        //定义一个空字符串来接受读到的字符串
        String str = "";
        //  这个字符数组的长度不能写死
        //String array[] = new String[1605];
        List array = new ArrayList<>();
        //int i = 0;
        //循环把读取到的字符赋给str
        while ((line = in.readLine()) != null) {
            if (line.contains("关键词")) {
                str = line.substring(line.indexOf(":") + 1);
                //array[i] = str;
                array.add(str);
                //i++;
            }
        }
        System.out.println("array:"+array.size());
        //  这个是存放所有关键字节点的  nodes
        List> nodesList = new ArrayList>();    // list是map类型的

        //  这个是存放节点之间的关系的  links
        List> nodesLinkList = new ArrayList>();

        // 存放所有的关键词
        List nodeIdList = new ArrayList();


        //
        Map countKeysMap = keysWordCount();
        // 遍历   这里存放的是所有关键词的频次
       /* for (String key:countKeysMap.keySet()){
            System.out.println(key+"  "+countKeysMap.get(key));
        }*/

       int count = 0;
        //  点不录入重复的
        for (int j = 0; j < array.size(); j++) {   //遍历的是每一篇关键词串
            String re[] = array.get(j).split(";");
            count = count + re.length;
            for (int m = 0; m < re.length; m++) {    //   一篇论文中的关键词   有多个关键词
                //  结点
                //  这个map相当于存的是一篇论文中的关键词,以map的形式存储
                //  节点还应该具备的属性有 节点的大小 symbolSize
                // itemStyle: {
                //        color: 'red'
                //    }
                // 和节点的颜色

                for (String key : countKeysMap.keySet()) {
                    //  关键词
                    //String keyStr = re[m];
                    if ((key.trim()).equals(re[m].trim())) {
                        //  频次
                        Integer valueInt = countKeysMap.get(key);
                        if (valueInt == 1) {
                            //String colorStr = "{color:\"black\"}";
                            Map colorMap = new HashMap();
                            colorMap.put("color","#c23531");
                            Map nodesMap = new HashMap();
                            nodesMap.put("name", re[m].trim());
                            //nodesMap.put("value",1);
                            nodesMap.put("symbolSize", 3);
                            nodesMap.put("itemStyle",colorMap);
                            nodesList.add(nodesMap);
                        } else if (valueInt > 1 && valueInt <= 50) {
                            Map colorMap = new HashMap();
                            colorMap.put("color","#2f4554");
                            Map nodesMap = new HashMap();
                            nodesMap.put("name", re[m].trim());
                            //nodesMap.put("value",1);
                            nodesMap.put("symbolSize", 8);
                            nodesMap.put("itemStyle",colorMap);
                            nodesList.add(nodesMap);
                        } else if (valueInt > 50 && valueInt <= 100) {
                            Map colorMap = new HashMap();
                            colorMap.put("color","#61a0a8");
                            Map nodesMap = new HashMap();
                            nodesMap.put("name", re[m].trim());
                            //nodesMap.put("value",1);
                            nodesMap.put("symbolSize", 30);
                            nodesMap.put("itemStyle",colorMap);
                            nodesList.add(nodesMap);
                        } else if (valueInt > 100 && valueInt <= 300) {
                            Map colorMap = new HashMap();
                            colorMap.put("color","#d48265");
                            Map nodesMap = new HashMap();
                            nodesMap.put("name", re[m].trim());
                            //nodesMap.put("value",1);
                            nodesMap.put("symbolSize", 40);
                            nodesMap.put("itemStyle",colorMap);
                            nodesList.add(nodesMap);
                        } else if (valueInt > 300 && valueInt <= 500) {
                            Map colorMap = new HashMap();
                            colorMap.put("color","#91c7ae");
                            Map nodesMap = new HashMap();
                            nodesMap.put("name", re[m].trim());
                            //nodesMap.put("value",1);
                            nodesMap.put("symbolSize", 50);
                            nodesMap.put("itemStyle",colorMap);
                            nodesList.add(nodesMap);
                        } else {
                            Map colorMap = new HashMap();
                            colorMap.put("color","#749f83");
                            Map nodesMap = new HashMap();
                            nodesMap.put("name", re[m].trim());
                            //nodesMap.put("value",1);
                            nodesMap.put("symbolSize", 60);
                            nodesMap.put("itemStyle",colorMap);
                            nodesList.add(nodesMap);
                        }
                    }
                }
            }
        }
        //  对关键词去重,针对重复的元素,只存一次
        Set> keysSet = new HashSet>();
        keysSet.addAll(nodesList);

        System.out.println("count :"+count);
        System.out.println("所有节点数:"+nodesList.size());
        System.out.println("去重之后的节点数:"+keysSet.size());

        //  关系照常录入
        for (int j = 0; j < array.size(); j++) {
            String re[] = array.get(j).split(";");
            for (int m = 0; m < re.length - 1; m++) {    //   一篇论文中的关键词   有三个关键词
                //  关系
                Map linksMap = new HashMap();// 第一次 3个   第二次  3个
                // linksMap.put("source",count); //0   0   0 完   3  3  3
                // linksMap.put("target",count+m);  //0   1  2 完  3  4  5
                linksMap.put("source", re[m].trim()); //0   0   0 完   3  3  3
                linksMap.put("target", re[m + 1].trim());  //0   1  2 完  3  4  5
                nodesLinkList.add(linksMap);
            }
        }

        System.out.println("边数:"+nodesLinkList.size());

        String jsonObject1 = JSONObject.valueToString(JSONObject.wrap(keysSet));
        String jsonObject2 = JSONObject.valueToString(JSONObject.wrap(nodesLinkList));

        bufferedWriter.write("{\"nodes\":" + jsonObject1 + ",\"links\":" + jsonObject2 + "}");


        bufferedWriter.flush();

        // 关闭
        bufferedWriter.close();
        fileWriter.close();
    }


    private Map keysWordCount() throws Exception {
        File path1 = new File(ResourceUtils.getURL("classpath:").getPath());
        if (!path1.exists()) path1 = new File("");
        // System.out.println("path:"+path1.getAbsolutePath());
        File upload = new File(path1.getAbsolutePath(), "src/main/webapp/data");
        if (!upload.exists()) upload.mkdirs();
        String pathKey = upload.getAbsolutePath() + "\\textKey.txt";
        // 路径文件
        File filekey = new File(pathKey);
        FileReader fileReader = new FileReader(filekey);
        BufferedReader bufferedReader = new BufferedReader(fileReader);
        // 统计词频
        String file = "";
        String s;
        while ((s = bufferedReader.readLine()) != null) {
            file += s;   //  读取整篇文章,存入String类的file中
        }
        // System.out.println(file);

        StringTokenizer st = new StringTokenizer(file, ";");  //用于切分字符串

        Map hm = new HashMap();

        while (st.hasMoreTokens()) {
            String word = st.nextToken();
            if (hm.get(word) != null) {
                int value = ((Integer) hm.get(word)).intValue();
                value++;
                hm.put(word, new Integer(value));
            } else {
                hm.put(word, new Integer(1));
            }
        }
        //1、按顺序保存map中的元素,使用LinkedList类型
        List> keyList = new LinkedList>(hm.entrySet());
        //System.out.println("一共有  "+keyList.size()+" 个关键词");
        //2、按照自定义的规则排序
        Collections.sort(keyList, new Comparator>() {
            @Override
            public int compare(Map.Entry o1,
                               Map.Entry o2) {
                if (o2.getValue().compareTo(o1.getValue()) > 0) {
                    return 1;
                } else if (o2.getValue().compareTo(o1.getValue()) < 0) {
                    return -1;
                } else {
                    return 0;
                }
            }

        });
        //3、将LinkedList按照排序好的结果,存入到HashMap中
        HashMap result = new LinkedHashMap<>();
        for (Map.Entry entry : keyList) {
            result.put(entry.getKey(), entry.getValue());
        }
        Map resultMap = new HashMap<>();
        //  将频次大于等于2的取出来
        for (String key : result.keySet()) {
            if (hm.get(key) >= 2) {
                resultMap.put(key, result.get(key));
            }
        }
        return result;
    }

    /**
     * Runs the CNKI search and stores, for every result row, the detail-page link and the value
     * of the row's fifth column.
     *
     * <p>Links are written to {@code detailUrl.txt} (one per line, with {@code kns/} replaced by
     * {@code KCMS/}) and the fifth-column text to {@code yearText.txt}; line <i>n</i> of both
     * files belongs to the same result row. Both files are overwritten on every call.
     *
     * <p>Every result page is visited, including the last one. The browser is closed and both
     * files are flushed and closed when the method returns, whether normally or with an
     * exception.
     *
     * @param themeStr       text entered into the first search field
     * @param keyStr         text entered into the keyword search field
     * @param abstractKeyStr text entered into the abstract search field
     * @throws Exception if the browser cannot be driven as expected or a file cannot be written
     */
    private void initqueryAndGetData(String themeStr, String keyStr, String abstractKeyStr) throws Exception {

        // Location of the chromedriver binary.
        System.setProperty("webdriver.chrome.driver",
                "D:\\Google\\Chrome\\Application\\chromedriver.exe");

        // Resolve <classpath root>/src/main/webapp/data, falling back to the working directory.
        File classpathRoot = new File(ResourceUtils.getURL("classpath:").getPath());
        if (!classpathRoot.exists()) {
            classpathRoot = new File("");
        }
        File dataDir = new File(classpathRoot.getAbsolutePath(), "src/main/webapp/data");
        if (!dataDir.exists()) {
            dataDir.mkdirs();
        }
        File detailUrlFile = new File(dataDir, "detailUrl.txt");
        File yearFile = new File(dataDir, "yearText.txt");

        ChromeDriver driver = new ChromeDriver();
        // false = overwrite the files of a previous run.
        try (BufferedWriter detailWriter = new BufferedWriter(new FileWriter(detailUrlFile, false));
             BufferedWriter yearWriter = new BufferedWriter(new FileWriter(yearFile, false))) {

            // NOTE(review): kept from the original; it runs before any page is loaded, so it
            // probably has no visible effect - confirm whether it can be dropped.
            driver.executeScript("window.scrollTo(0, document.body.scrollHeight);");

            driver.get("http://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCDB&crossDbcodes=CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD");

            // First search row.
            driver.findElement(By.name("txt_1_value1")).sendKeys(themeStr);
            // Second search row: pick the second logical operator, then enter the keyword.
            driver.findElement(By.xpath("//*[@id=\"txt_2_logical\"]/option[2]")).click();
            driver.findElement(By.name("txt_2_value1")).sendKeys(keyStr);
            // "+" adds a third search row.
            driver.findElement(By.xpath("//*[@id=\"txt_1\"]/td[1]/a[1]")).click();
            // Third search row: field type, logical operator, value; ENTER submits the search.
            driver.findElement(By.xpath("//*[@id=\"txt_3_sel\"]/option[4]")).click();
            driver.findElement(By.xpath("//*[@id=\"txt_3_logical\"]/option[2]")).click();
            WebElement abstractInput = driver.findElement(By.name("txt_3_value1"));
            abstractInput.sendKeys(abstractKeyStr);
            abstractInput.sendKeys(Keys.ENTER);

            // The result list lives inside an iframe.
            WebElement iframe = driver.findElement(By.id("iframeResult"));
            driver.switchTo().frame(iframe);
            Thread.sleep(2000);

            // Third "rows per page" link - presumably the largest page size; confirm on the site.
            WebElement displayNum = driver.findElement(By.id("id_grid_display_num"));
            List<WebElement> displayLinks = displayNum.findElements(By.tagName("a"));
            displayLinks.get(2).click();

            // The pager text ends with "/<total pages>".
            String pageInfo = driver.findElement(By.xpath(
                    "//*[@id=\"J_ORDER\"]/tbody/tr[2]/td/table/tbody/tr/td[2]/div/span[1]")).getText();
            int totalPages = Integer.parseInt(pageInfo.substring(pageInfo.indexOf("/") + 1).trim());
            System.out.println(totalPages);

            for (int page = 0; page < totalPages; page++) {
                scrapeResultPage(driver, iframe, detailWriter, yearWriter);
                // Flush per page so already collected rows survive a later failure.
                detailWriter.flush();
                yearWriter.flush();

                if (page == totalPages - 1) {
                    break;  // last page scraped, nothing left to click
                }
                if (page != 0 && page % 14 == 0) {
                    // Long pause on every 14th page - presumably to avoid the site's
                    // anti-crawler checks; confirm before changing the interval.
                    Thread.sleep(90000);
                }
                try {
                    driver.findElement(By.id("Page_next")).click();
                } catch (org.openqa.selenium.NoSuchElementException e) {
                    // Fully qualified on purpose: the simple name resolves to
                    // java.util.NoSuchElementException through the wildcard import.
                    System.out.println("element is noe exist");
                    break;  // without a "next" button the same page would be scraped again
                }
            }
        } finally {
            driver.quit();
        }
    }

    /**
     * Writes the link and the fifth-column text of every row on the current result page.
     *
     * @param driver       browser showing the result page
     * @param iframe       the result iframe element located in the top-level document
     * @param detailWriter receives one rewritten detail link per row
     * @param yearWriter   receives the fifth-column text of the same row
     * @throws IOException if writing to either file fails
     */
    private void scrapeResultPage(WebDriver driver, WebElement iframe,
                                  BufferedWriter detailWriter, BufferedWriter yearWriter) throws IOException {
        // The original compared window handles with "!=", a reference comparison that was true
        // for every handle. An equals() check would skip the only open window and collect
        // nothing, so each handle is visited unconditionally, which is what actually ran before.
        for (String handle : driver.getWindowHandles()) {
            // Switching windows resets the context to the top document; re-enter the iframe.
            driver.switchTo().window(handle);
            driver.switchTo().frame(iframe);

            List<WebElement> resultContainers =
                    driver.findElements(By.xpath("//*[@id=\"ctl00\"]/table/tbody/tr[2]"));
            for (WebElement container : resultContainers) {
                for (WebElement tbody : container.findElements(By.tagName("tbody"))) {
                    List<WebElement> rows = tbody.findElements(By.tagName("tr"));
                    // Index 0 is skipped - presumably the table header row.
                    for (int r = 1; r < rows.size(); r++) {
                        List<WebElement> cells = rows.get(r).findElements(By.tagName("td"));

                        // XPath positions are 1-based, so list index r is tr[r + 1].
                        WebElement link = driver.findElement(By.xpath(
                                "//*[@id=\"ctl00\"]/table/tbody/tr[2]/td/table/tbody/tr[" + (r + 1) + "]/td[2]/a"));
                        // The href taken from the list is not usable as is; the path is rewritten.
                        String detailUrl = link.getAttribute("href").replace("kns/", "KCMS/");
                        detailWriter.write(detailUrl + "\n");

                        String year = cells.get(4).getText();
                        System.out.println(year);
                        yearWriter.write(year + "\n");
                    }
                }
            }
        }
    }

    /**
     * 通过详情url来爬取内容
     * 注意:读取两个文本文件(存放详情链接的,存放出版年月的)
     *      存储三个文本文件(所有的内容文本,只有关键词的文本,关键词语出版年月一一对应的文本)
     * @throws Exception
     * @param
     * @param
     */
   /* @RequestMapping("/spiderByUrl")
    @ResponseBody*/
    public void spiderByUrl() throws Exception{

        //  声明谷歌浏览器
        System.setProperty("webdriver.chrome.driver",
                "D:\\Google\\Chrome\\Application\\chromedriver.exe");
        WebDriver chromdriver;
        chromdriver = new ChromeDriver();
        /////////////////////////////////////////////////////////////////////////////////////
        //声明火狐浏览器
        WebDriver firedriver;
        //火狐的安装位置
        System.setProperty("webdriver.firefox.bin","D:\\fireBrower\\firefox.exe");
        //加载驱动
        System.setProperty("webdriver.firefox.marionette","D:\\fireBrower\\geckodriver.exe");
        firedriver=new FirefoxDriver();
        //firedriver.get("http://sms.webchinese.cn/api.shtml");

        ///////////////////////////////////////////////////////////////////////////////////
        File path1 = new File(ResourceUtils.getURL("classpath:").getPath());
        if (!path1.exists()) path1 = new File("");
        // System.out.println("path:"+path1.getAbsolutePath());
        File upload = new File(path1.getAbsolutePath(), "src/main/webapp/data");
        if (!upload.exists()) upload.mkdirs();
        String pathKey = upload.getAbsolutePath() + "\\detailUrl.txt";   //  这里最终是  detailUrl.txt
        // 路径文件
        File filekey = new File(pathKey);

        BufferedReader in = new BufferedReader(new FileReader(filekey));

        String pathYearText = upload.getAbsolutePath() + "\\yearText.txt";
        File fileYearText = new File(pathYearText);
        BufferedReader inYear = new BufferedReader(new FileReader(fileYearText));
        List yearListRe = new ArrayList<>();
        String lineYear = null;
        while ((lineYear = inYear.readLine()) != null){
            yearListRe.add(lineYear);
        }

        String line = null;
        int i = 0;
        List reUrlList = new ArrayList<>();
        //循环把读取到的字符赋给str
        while ((line = in.readLine()) != null) {
            //System.out.println(line);
            reUrlList.add(line);   //读取详情列表中的每一行
        }
        //
        String pathText = upload.getAbsolutePath() + "\\text.txt";   //  最终是 text.txt
        File fileText = new File(pathText);
        FileWriter fileWriter = new FileWriter(fileText, false);
        BufferedWriter bufferedWriter = new BufferedWriter(fileWriter);

        //
        String pathKeyTest = upload.getAbsolutePath() + "\\textKey.txt";    //    最终是 textKey.txt
        File fileKeyTest = new File(pathKeyTest);
        FileWriter fileWriterKey = new FileWriter(fileKeyTest, false);
        BufferedWriter bufferedWriterKey = new BufferedWriter(fileWriterKey);


        String pathKeyTest2 = upload.getAbsolutePath() + "\\textKeyAndYear.txt";    //    最终是 textKey.txt
        File fileKeyTest2 = new File(pathKeyTest2);
        FileWriter fileWriterKey2 = new FileWriter(fileKeyTest2, false);
        BufferedWriter bufferedWriterKey2 = new BufferedWriter(fileWriterKey2);

        int CountNumber = 1;

        for (int j = 0; j < reUrlList.size(); j++) {
            if ( j % 2 == 0  ){   // 偶数
                if (j == 0){
                    Thread.sleep(1000);
                    chromdriver.get(reUrlList.get(j));
                   // chromdriver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
                    // System.out.println(chromdriver.getWindowHandle());
                    //Thread.sleep(500);
                    //chromdriver.navigate().refresh();
                    System.out.println(CountNumber++);
                    String TitleStr = "";
                    String AuthorStr = "";
                    String DepartStr = "";
                    String ZhaiyaoStr = "";
                    String GuanjianciStr = "";
                    String baseMoneyStr = "";
                    String classNumStr = "";
                    String GuanjianCi = "";
                    String shaiXuanKeyWord = "";

                    //  数据的爬取  开始
                    //  获取标题【肯定有】 //*[@id="mainArea"]/div[3]/div[1]/h2  //*[@id="mainArea"]/div[2]/div[1]/h2  //*[@id="mainArea"]
                    WebElement eleTitleMain = chromdriver.findElement(By.id("mainArea"));
                    WebElement eleTitleCon = eleTitleMain.findElement(By.className("wxmain"));
                    WebElement eleTitleTitle = eleTitleCon.findElement(By.className("wxTitle"));
                    WebElement eleTitleText = eleTitleTitle.findElement(By.tagName("h2"));
                    // 标题
                    TitleStr = eleTitleText.getText();

                    // 作者【可能没有,判空】  //*[@id="mainArea"]/div[3]/div[1]/div[1]   //*[@id="mainArea"]/div[3]/div[1]/div[1]/span[1]
                    WebElement eleAuthor = chromdriver.findElement(By.id("mainArea"));
                    WebElement eleAuthorCon = eleAuthor.findElement(By.className("wxmain"));
                    WebElement eleAuthorTitle = eleAuthorCon.findElement(By.className("wxTitle"));
                    WebElement eleAuthorDiv = eleAuthorTitle.findElement(By.className("author"));
                    List authors = eleAuthorDiv.findElements(By.tagName("span"));
                    // 作者
                    if (authors.size() != 0){
                        for (int q = 1; q < authors.size(); q++) {
                            AuthorStr += authors.get(q).getText() + " ";
                        }
                    }else {
                        AuthorStr += "为空";
                    }

                    // 单位【可能没有,判空】   //*[@id="mainArea"]/div[3]/div[1]/div[2]
                    WebElement eleDepart = chromdriver.findElement(By.id("mainArea"));
                    WebElement eleDepartCon = eleDepart.findElement(By.className("wxmain"));
                    WebElement eleDepartTitle = eleDepartCon.findElement(By.className("wxTitle"));
                    WebElement eleDepartDiv = eleDepartTitle.findElement(By.className("orgn"));
                    List eleDeparts = eleDepartDiv.findElements(By.tagName("span"));
                    if (eleDeparts.size() != 0){
                        for (int e = 1; e < eleDeparts.size(); e++) {
                            DepartStr += eleDeparts.get(e).getText() + " ";
                        }
                    }else {
                        DepartStr += "为空";
                    }

                    // 摘要【可能没有,判空】    //*[@id="mainArea"]/div[2]/div[2]/div[1]/p[1]   这里取所有
                    String mainContent = "";
                    WebElement eleZhaiyao = chromdriver.findElement(By.id("mainArea"));
                    WebElement eleZhaiyaoCon = eleZhaiyao.findElement(By.className("wxmain"));
                    WebElement eleZhaiyaoTitle = eleZhaiyaoCon.findElement(By.className("wxInfo"));
                    WebElement eleZhaiyaoDiv = eleZhaiyaoTitle.findElement(By.className("wxBaseinfo"));
                    List spanEle = eleZhaiyaoDiv.findElements(By.tagName("p"));

                    if (spanEle.size() != 0){
                        for (int r = 0; r < spanEle.size(); r++) {
                            if (spanEle.get(r).getText().equals("")) {

                            } else {
                                if (spanEle.get(r).getText().contains("手机阅读本文") || spanEle.get(r).getText().contains("下载安装手机APP") ||
                                        spanEle.get(r).getText().contains("扫码同步阅读本文") || spanEle.get(r).getText().contains("文内图片") ||
                                        spanEle.get(r).getText().contains("图1") || spanEle.get(r).getText().contains("图 2") ||
                                        spanEle.get(r).getText().contains("图3") || spanEle.get(r).getText().contains("图4") ||
                                        spanEle.get(r).getText().contains("图5") || spanEle.get(r).getText().contains("图6") ||
                                        spanEle.get(r).getText().contains("图 7") || spanEle.get(r).getText().contains("图 8") ||
                                        spanEle.get(r).getText().contains("图 9") || spanEle.get(r).getText().contains("图10")) {

                                } else {
                                    shaiXuanKeyWord = spanEle.get(r).getText();
                                    if (shaiXuanKeyWord.contains("关键词")) {
                                        //  存储含有关键字的论文和出版年月
                                        bufferedWriterKey2.write(shaiXuanKeyWord+"\n");
                                        bufferedWriterKey2.write(yearListRe.get(j)+"\n");
                                        bufferedWriterKey2.flush();
                                        GuanjianciStr += shaiXuanKeyWord;
                                    }
                                    mainContent += shaiXuanKeyWord + "\n";
                                }
                            }
                        }
                    }else {
                        GuanjianciStr += "为空";
                        mainContent += "为空";
                    }
                    //  截取关键字中的冒号后面的内容
                    String GuanjianciStrRe = GuanjianciStr.substring(GuanjianciStr.indexOf(":") + 1);
                    GuanjianCi = GuanjianciStrRe.replace(" ", "");
                    //  抓取论文的下载链接
                    String paperHref = "";
                    WebElement downEle = chromdriver.findElement(By.id("DownLoadParts"));
                    List downList = downEle.findElements(By.tagName("a"));
                    for (int k=0;k authors = eleAuthorDiv.findElements(By.tagName("span"));
                    // 作者
                    if (authors.size() != 0){
                        for (int q = 1; q < authors.size(); q++) {
                            AuthorStr += authors.get(q).getText() + " ";
                        }
                    }else {
                        AuthorStr += "为空";
                    }

                    // 单位【可能没有,判空】   //*[@id="mainArea"]/div[3]/div[1]/div[2]
                    WebElement eleDepart = chromdriver.findElement(By.id("mainArea"));
                    WebElement eleDepartCon = eleDepart.findElement(By.className("wxmain"));
                    WebElement eleDepartTitle = eleDepartCon.findElement(By.className("wxTitle"));
                    WebElement eleDepartDiv = eleDepartTitle.findElement(By.className("orgn"));
                    List eleDeparts = eleDepartDiv.findElements(By.tagName("span"));
                    if (eleDeparts.size() != 0){
                        for (int e = 1; e < eleDeparts.size(); e++) {
                            DepartStr += eleDeparts.get(e).getText() + " ";
                        }
                    }else {
                        DepartStr += "为空";
                    }

                    // 摘要【可能没有,判空】    //*[@id="mainArea"]/div[2]/div[2]/div[1]/p[1]   这里取所有
                    String mainContent = "";
                    WebElement eleZhaiyao = chromdriver.findElement(By.id("mainArea"));
                    WebElement eleZhaiyaoCon = eleZhaiyao.findElement(By.className("wxmain"));
                    WebElement eleZhaiyaoTitle = eleZhaiyaoCon.findElement(By.className("wxInfo"));
                    WebElement eleZhaiyaoDiv = eleZhaiyaoTitle.findElement(By.className("wxBaseinfo"));
                    List spanEle = eleZhaiyaoDiv.findElements(By.tagName("p"));

                    if (spanEle.size() != 0){
                        for (int r = 0; r < spanEle.size(); r++) {
                            if (spanEle.get(r).getText().equals("")) {

                            } else {
                                if (spanEle.get(r).getText().contains("手机阅读本文") || spanEle.get(r).getText().contains("下载安装手机APP") ||
                                        spanEle.get(r).getText().contains("扫码同步阅读本文") || spanEle.get(r).getText().contains("文内图片") ||
                                        spanEle.get(r).getText().contains("图1") || spanEle.get(r).getText().contains("图 2") ||
                                        spanEle.get(r).getText().contains("图3") || spanEle.get(r).getText().contains("图4") ||
                                        spanEle.get(r).getText().contains("图5") || spanEle.get(r).getText().contains("图6") ||
                                        spanEle.get(r).getText().contains("图 7") || spanEle.get(r).getText().contains("图 8") ||
                                        spanEle.get(r).getText().contains("图 9") || spanEle.get(r).getText().contains("图10")) {

                                } else {
                                    shaiXuanKeyWord = spanEle.get(r).getText();
                                    if (shaiXuanKeyWord.contains("关键词")) {
                                        //  存储含有关键字的论文和出版年月
                                        bufferedWriterKey2.write(shaiXuanKeyWord+"\n");
                                        bufferedWriterKey2.write(yearListRe.get(j)+"\n");
                                        bufferedWriterKey2.flush();
                                        GuanjianciStr += shaiXuanKeyWord;
                                    }
                                    mainContent += shaiXuanKeyWord + "\n";
                                }
                            }
                        }
                    }else {
                        GuanjianciStr += "为空";
                        mainContent += "为空";
                    }
                    //  截取关键字中的冒号后面的内容
                    String GuanjianciStrRe = GuanjianciStr.substring(GuanjianciStr.indexOf(":") + 1);
                    GuanjianCi = GuanjianciStrRe.replace(" ", "");
                    //  抓取论文的下载链接
                    String paperHref = "";
                    WebElement downEle = chromdriver.findElement(By.id("DownLoadParts"));
                    List downList = downEle.findElements(By.tagName("a"));
                    for (int k=0;k authors = eleAuthorDiv.findElements(By.tagName("span"));
                    // 作者
                    if (authors.size() != 0){
                        for (int q = 1; q < authors.size(); q++) {
                            AuthorStr += authors.get(q).getText() + " ";
                        }
                    }else {
                        AuthorStr += "为空";
                    }

                    // 单位【可能没有,判空】   //*[@id="mainArea"]/div[3]/div[1]/div[2]
                    WebElement eleDepart = chromdriver.findElement(By.id("mainArea"));
                    WebElement eleDepartCon = eleDepart.findElement(By.className("wxmain"));
                    WebElement eleDepartTitle = eleDepartCon.findElement(By.className("wxTitle"));
                    WebElement eleDepartDiv = eleDepartTitle.findElement(By.className("orgn"));
                    List eleDeparts = eleDepartDiv.findElements(By.tagName("span"));
                    if (eleDeparts.size() != 0){
                        for (int e = 1; e < eleDeparts.size(); e++) {
                            DepartStr += eleDeparts.get(e).getText() + " ";
                        }
                    }else {
                        DepartStr += "为空";
                    }

                    // 摘要【可能没有,判空】    //*[@id="mainArea"]/div[2]/div[2]/div[1]/p[1]   这里取所有
                    String mainContent = "";
                    WebElement eleZhaiyao = chromdriver.findElement(By.id("mainArea"));
                    WebElement eleZhaiyaoCon = eleZhaiyao.findElement(By.className("wxmain"));
                    WebElement eleZhaiyaoTitle = eleZhaiyaoCon.findElement(By.className("wxInfo"));
                    WebElement eleZhaiyaoDiv = eleZhaiyaoTitle.findElement(By.className("wxBaseinfo"));
                    List spanEle = eleZhaiyaoDiv.findElements(By.tagName("p"));

                    if (spanEle.size() != 0){
                        for (int r = 0; r < spanEle.size(); r++) {
                            if (spanEle.get(r).getText().equals("")) {

                            } else {
                                if (spanEle.get(r).getText().contains("手机阅读本文") || spanEle.get(r).getText().contains("下载安装手机APP") ||
                                        spanEle.get(r).getText().contains("扫码同步阅读本文") || spanEle.get(r).getText().contains("文内图片") ||
                                        spanEle.get(r).getText().contains("图1") || spanEle.get(r).getText().contains("图 2") ||
                                        spanEle.get(r).getText().contains("图3") || spanEle.get(r).getText().contains("图4") ||
                                        spanEle.get(r).getText().contains("图5") || spanEle.get(r).getText().contains("图6") ||
                                        spanEle.get(r).getText().contains("图 7") || spanEle.get(r).getText().contains("图 8") ||
                                        spanEle.get(r).getText().contains("图 9") || spanEle.get(r).getText().contains("图10")) {

                                } else {
                                    shaiXuanKeyWord = spanEle.get(r).getText();
                                    if (shaiXuanKeyWord.contains("关键词")) {
                                        //  存储含有关键字的论文和出版年月
                                        bufferedWriterKey2.write(shaiXuanKeyWord+"\n");
                                        bufferedWriterKey2.write(yearListRe.get(j)+"\n");
                                        bufferedWriterKey2.flush();
                                        GuanjianciStr += shaiXuanKeyWord;
                                    }
                                    mainContent += shaiXuanKeyWord + "\n";
                                }
                            }
                        }
                    }else {
                        GuanjianciStr += "为空";
                        mainContent += "为空";
                    }
                    //  截取关键字中的冒号后面的内容
                    String GuanjianciStrRe = GuanjianciStr.substring(GuanjianciStr.indexOf(":") + 1);
                    GuanjianCi = GuanjianciStrRe.replace(" ", "");
                    //  抓取论文的下载链接
                    String paperHref = "";
                    WebElement downEle = chromdriver.findElement(By.id("DownLoadParts"));
                    List downList = downEle.findElements(By.tagName("a"));
                    for (int k=0;k authors = eleAuthorDiv.findElements(By.tagName("span"));
                // 作者
                if (authors.size() != 0){
                    for (int q = 1; q < authors.size(); q++) {
                        AuthorStr += authors.get(q).getText() + " ";
                    }
                }else {
                    AuthorStr += "为空";
                }

                // 单位【可能没有,判空】   //*[@id="mainArea"]/div[3]/div[1]/div[2]
                // //*[@id="mainArea"]/div[3]/div[1]/div[2]   //*[@id="mainArea"]/div[3]/div[1]/div[2]
                // WebElement eleDepart = driver.findElement(By.xpath("//*[@id=\"mainArea\"]"));
                WebElement eleDepart = firedriver.findElement(By.id("mainArea"));
                WebElement eleDepartCon = eleDepart.findElement(By.className("wxmain"));
                WebElement eleDepartTitle = eleDepartCon.findElement(By.className("wxTitle"));
                WebElement eleDepartDiv = eleDepartTitle.findElement(By.className("orgn"));
                List eleDeparts = eleDepartDiv.findElements(By.tagName("span"));
                if (eleDeparts.size() != 0){
                    for (int e = 1; e < eleDeparts.size(); e++) {
                        DepartStr += eleDeparts.get(e).getText() + " ";
                    }
                }else {
                    DepartStr += "为空";
                }

                // 摘要【可能没有,判空】    //*[@id="mainArea"]/div[2]/div[2]/div[1]/p[1]   这里取所有
                // 用一个list来装这些信息
                //List mainContent = new ArrayList<>();
                String mainContent = "";
                //   WebElement eleZhaiyao = driver.findElement(By.xpath("//*[@id=\"mainArea\"]"));
                WebElement eleZhaiyao = firedriver.findElement(By.id("mainArea"));
                WebElement eleZhaiyaoCon = eleZhaiyao.findElement(By.className("wxmain"));
                WebElement eleZhaiyaoTitle = eleZhaiyaoCon.findElement(By.className("wxInfo"));
                WebElement eleZhaiyaoDiv = eleZhaiyaoTitle.findElement(By.className("wxBaseinfo"));
                List spanEle = eleZhaiyaoDiv.findElements(By.tagName("p"));

                if (spanEle.size() != 0){
                    //String str = "[0-9]{1}";
                    for (int r = 0; r < spanEle.size(); r++) {
                        //System.out.println(spanEle.get(r).getText());
                        if (spanEle.get(r).getText().equals("")) {

                        } else {
                            if (spanEle.get(r).getText().contains("手机阅读本文") || spanEle.get(r).getText().contains("下载安装手机APP") ||
                                    spanEle.get(r).getText().contains("扫码同步阅读本文") || spanEle.get(r).getText().contains("文内图片") ||
                                    spanEle.get(r).getText().contains("图1") || spanEle.get(r).getText().contains("图 2") ||
                                    spanEle.get(r).getText().contains("图3") || spanEle.get(r).getText().contains("图4") ||
                                    spanEle.get(r).getText().contains("图5") || spanEle.get(r).getText().contains("图6") ||
                                    spanEle.get(r).getText().contains("图 7") || spanEle.get(r).getText().contains("图 8") ||
                                    spanEle.get(r).getText().contains("图 9") || spanEle.get(r).getText().contains("图10")) {

                            } else {
                                shaiXuanKeyWord = spanEle.get(r).getText();
                                if (shaiXuanKeyWord.contains("关键词")) {
                                    //  存储含有关键字的论文和出版年月
                                    bufferedWriterKey2.write(shaiXuanKeyWord+"\n");
                                    bufferedWriterKey2.write(yearListRe.get(j)+"\n");
                                    bufferedWriterKey2.flush();
                                    GuanjianciStr += shaiXuanKeyWord;
                                }
                                mainContent += shaiXuanKeyWord + "\n";
                            }
                        }
                    }
                }else {
                    GuanjianciStr += "为空";
                    mainContent += "为空";
                }

                //  截取关键字中的冒号后面的内容
                String GuanjianciStrRe = GuanjianciStr.substring(GuanjianciStr.indexOf(":") + 1);
                GuanjianCi = GuanjianciStrRe.replace(" ", "");

                //  抓取论文的下载链接
                String paperHref = "";
                WebElement downEle = firedriver.findElement(By.id("DownLoadParts"));
                List downList = downEle.findElements(By.tagName("a"));
                for (int k=0;k> getKeyWord() throws IOException {

        File path1 = new File(ResourceUtils.getURL("classpath:").getPath());
        if (!path1.exists()) path1 = new File("");
        // System.out.println("path:"+path1.getAbsolutePath());
        File upload = new File(path1.getAbsolutePath(), "src/main/webapp/data");
        if (!upload.exists()) upload.mkdirs();
        String pathKey = upload.getAbsolutePath() + "\\textKey.txt";
        // 路径文件
        File filekey = new File(pathKey);
        FileReader fileReader = new FileReader(filekey);
        BufferedReader bufferedReader = new BufferedReader(fileReader);
        // 统计词频
        String file = "";
        String s;
        while ((s = bufferedReader.readLine()) != null) {
            file += s;
        }
        // System.out.println(file);

        StringTokenizer st = new StringTokenizer(file, ";");  //用于切分字符串

        Map hm = new HashMap();

        while (st.hasMoreTokens()) {
            String word = st.nextToken();
            if (hm.get(word) != null) {
                int value = ((Integer) hm.get(word)).intValue();
                value++;
                hm.put(word, new Integer(value));
            } else {
                hm.put(word, new Integer(1));
            }
        }

        //1、按顺序保存map中的元素,使用LinkedList类型
        List> keyList = new LinkedList>(hm.entrySet());
        // System.out.println("一共有  "+keyList.size()+" 个关键词");
        //2、按照自定义的规则排序
        Collections.sort(keyList, new Comparator>() {
            @Override
            public int compare(Map.Entry o1,
                               Map.Entry o2) {
                if (o2.getValue().compareTo(o1.getValue()) > 0) {
                    return 1;
                } else if (o2.getValue().compareTo(o1.getValue()) < 0) {
                    return -1;
                } else {
                    return 0;
                }
            }

        });
        //3、将LinkedList按照排序好的结果,存入到HashMap中
        HashMap result = new LinkedHashMap<>();
        for (Map.Entry entry : keyList) {
            result.put(entry.getKey(), entry.getValue());
        }

        List keylist =new ArrayList<>(result.keySet());
        List fivekeyList = new ArrayList<>();
        fivekeyList = keylist.subList(0,5);  //取前五个元素
        List valuesList = new ArrayList(result.values());
        List fivekeyValueList = new ArrayList<>();
        fivekeyValueList = valuesList.subList(0,5);
        //  将取到的前五个合并为map
        Map resultMap = new HashMap();
        for (int k=0;k<5;k++){
            resultMap.put(fivekeyList.get(k),fivekeyValueList.get(k));
        }
        //  遍历resultMap
        /*for (String key : resultMap.keySet()) {
            System.out.println(key + ""+resultMap.get(key));
        }*/

        Map> map = new HashMap>();
        map.put("resultMap", resultMap);
        //String jsonString = JSON.toJSONString(map);
        return map;

    }


    /**
     * Launches Firefox through Selenium and opens a fixed page.
     *
     * <p>Both the browser binary and the driver executable are read from hard-coded
     * {@code D:\fireBrower} paths.
     */
    @RequestMapping("/fire")
    @ResponseBody
    public void fire(){
        // Location of the Firefox installation.
        final String firefoxBinary = "D:\\fireBrower\\firefox.exe";
        // Location of the geckodriver executable.
        // NOTE(review): geckodriver's path is conventionally supplied via "webdriver.gecko.driver";
        // confirm this key is honoured by the Selenium version in use.
        final String geckoDriver = "D:\\fireBrower\\geckodriver.exe";

        System.setProperty("webdriver.firefox.bin", firefoxBinary);
        System.setProperty("webdriver.firefox.marionette", geckoDriver);

        // NOTE(review): the driver is never quit in this method, so the browser stays open.
        final WebDriver browser = new FirefoxDriver();
        browser.get("http://sms.webchinese.cn/api.shtml");
    }
}


你可能感兴趣的:(java+selenium爬取知网数据)