// 爬虫程序 (web crawler program)


package com.jw;

import com.jw.excel.ExcelUtil;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.net.URLEncoder;
import java.util.List;
import java.util.Random;
import java.util.zip.GZIPInputStream;

/**
 * Command-line scraper that enriches a word list read from an Excel workbook.
 *
 * <p>Two stages are implemented. {@code getWord} requests one page per word and copies the
 * pronunciations and the first two meanings found in it onto the {@code Word}. {@code getAuto}
 * downloads one MP3 per available pronunciation into the {@code UK} and {@code US} folders.
 * {@code main} currently runs only the audio stage; the page stage is kept as commented-out code.
 *
 * <p>NOTE(review): the request URL prefixes, one proxy host and the page referer look redacted in
 * this copy of the file (empty or "xxxx"-style values). They are kept exactly as found and must be
 * filled in before the program can reach a server.
 *
 * @author lijiwang6407001878
 * @date 2019/12/28 10:15
 */
public class ParseWord {

    /** Word level; selects the workbook and audio folder, and is the level written to scraped words. */
    private static final String LEVEL = "B2";

    /** Directory containing {@code <level>.xlsx} and the {@code <level>\UK}, {@code <level>\US} folders. */
    private static final String BASE_DIR = "D:\\development\\English\\";

    /** Prefix of the audio request URL. NOTE(review): empty here, presumably redacted - TODO fill in. */
    private static final String AUDIO_URL_BASE = "";

    /** Prefix of the word page URL. NOTE(review): empty here, presumably redacted - TODO fill in. */
    private static final String PAGE_URL_BASE = "";

    /** Proxy host for audio downloads. NOTE(review): placeholder value - TODO confirm. */
    private static final String AUDIO_PROXY_HOST = "xxxxx";

    /** Proxy host for page requests. */
    private static final String PAGE_PROXY_HOST = "proxysz.zte.com.cn";

    /** Port used with both proxy hosts. */
    private static final int PROXY_PORT = 80;

    /** Referer header sent with page requests. NOTE(review): placeholder value - TODO confirm. */
    private static final String PAGE_REFERER = "xxxx";

    /** Connect/read timeout for every request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 8000;

    /** Fixed part of the delay inserted after each successful request, in milliseconds. */
    private static final int BASE_DELAY_MILLIS = 300;

    /** Value of the {@code type} query parameter used for files stored in the {@code UK} folder. */
    private static final int TYPE_UK = 1;

    /** Value of the {@code type} query parameter used for files stored in the {@code US} folder. */
    private static final int TYPE_US = 2;

    /** Single random source for user-agent selection and delay jitter (the program is single-threaded). */
    private static final Random RANDOM = new Random();

    /** Browser user-agent strings; one is picked at random for every request. */
    private static final String[] USER_AGENTS = {
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7",
            // NOTE(review): this entry was truncated with an ellipsis character in the source;
            // the platform token below is a reconstruction - verify against the intended browser.
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0"};

    /**
     * Reads {@code <level>.xlsx} from the base directory and downloads the audio for its words.
     *
     * @param args ignored
     * @throws Exception if the workbook cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        String t = LEVEL;
        String path = BASE_DIR + t + ".xlsx";

        List<Word> list;
        // Close the workbook stream as soon as the rows have been read.
        try (FileInputStream in = new FileInputStream(path)) {
            list = ExcelUtil.readExcel2007(in, Word.class, false);
        }

        // Page stage, disabled: scrape pronunciations/meanings and write them back to the workbook.
//        getWord(list);
//
//        Workbook workbook = ExcelUtil.createExcel2007(list, null, null, false);
//        FileOutputStream fos = new FileOutputStream(path);
//        workbook.write(fos);
//
//        workbook.close();
//        fos.close();

        getAuto(list, t);
    }

    /**
     * Downloads pronunciation audio for every word in {@code list}.
     *
     * <p>A word gets a file in {@code UK} when its UK pronunciation is non-null and a file in
     * {@code US} when its US pronunciation is non-null. A failure on one word is printed and the
     * loop moves on to the next word; an interrupt stops the whole run.
     *
     * @param list words to process
     * @param t    level name; files go under {@code <base dir><t>\UK} and {@code <base dir><t>\US}
     */
    private static void getAuto(List<Word> list, String t) {
        String path = BASE_DIR + t;

        for (Word w : list) {
            try {
                // Trimmed like the page stage does, so stray spaces end up in neither URL nor file name.
                String text = w.getWord().trim();

                if (w.getPronUk() != null) {
                    saveMp3(new File(path + "\\UK\\" + text + ".mp3"), text, TYPE_UK);
                    pause(100);
                }

                if (w.getPronUs() != null) {
                    saveMp3(new File(path + "\\US\\" + text + ".mp3"), text, TYPE_US);
                    pause(100);
                }
            } catch (InterruptedException e) {
                // Restore the flag and stop instead of swallowing the interrupt and looping on.
                Thread.currentThread().interrupt();
                System.out.println("interrupted " + w.getWord());
                return;
            } catch (Exception e) {
                System.out.println(e.getMessage() + " " + w.getWord());
            }
        }
    }

    /**
     * Downloads one pronunciation file through the audio proxy.
     *
     * <p>On HTTP 200 the response body is written to {@code mp3}, creating missing parent
     * directories first. Any other status is printed and nothing is written.
     *
     * @param mp3  destination file
     * @param word word to request; URL-encoded before it is placed in the query string
     * @param type value of the {@code type} query parameter
     * @throws IOException if the URL is malformed, the request fails, or the file cannot be written
     */
    private static void saveMp3(File mp3, String word, int type) throws IOException {
        String url = AUDIO_URL_BASE + "?audio=" + URLEncoder.encode(word, "UTF-8") + "&type=" + type;
        Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(AUDIO_PROXY_HOST, PROXY_PORT));
        HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection(proxy);
        try {
            // The referer must be forged here, otherwise the server answers with its home page.
            // Per the original author's analysis this is unrelated to cookies.
            con.setRequestProperty("User-Agent", randomUserAgent());
            con.setRequestProperty("Accept-Encoding", "gzip");
            con.setRequestProperty("referer", url);
            con.setDoInput(true);
            con.setConnectTimeout(TIMEOUT_MILLIS);
            con.setReadTimeout(TIMEOUT_MILLIS);

            int status = con.getResponseCode();
            if (status != HttpURLConnection.HTTP_OK) {
                System.out.println("服务器返回:" + status + " " + word);
                return;
            }

            File dir = mp3.getParentFile();
            if (dir != null && !dir.isDirectory() && !dir.mkdirs()) {
                throw new IOException("cannot create directory " + dir);
            }

            try (InputStream in = openBody(con);
                 OutputStream out = new FileOutputStream(mp3)) {
                byte[] buffer = new byte[1024 * 5];
                int length;
                while ((length = in.read(buffer)) != -1) {
                    out.write(buffer, 0, length);
                }
            }
        } finally {
            con.disconnect();
        }
    }

    /**
     * Opens the response body, undoing gzip compression when the server applied it.
     *
     * <p>The request advertises {@code Accept-Encoding: gzip} and {@code HttpURLConnection} does not
     * decompress on its own, so a compressed body must be unwrapped before it is saved.
     *
     * @param con connection whose response status has already been read
     * @return stream of the decoded body; the caller closes it
     * @throws IOException if the body cannot be opened or its gzip header is invalid
     */
    private static InputStream openBody(HttpURLConnection con) throws IOException {
        InputStream raw = con.getInputStream();
        if (!"gzip".equalsIgnoreCase(con.getContentEncoding())) {
            return raw;
        }
        try {
            return new GZIPInputStream(raw);
        } catch (IOException e) {
            raw.close();
            throw e;
        }
    }

    /**
     * Fills in pronunciations and meanings for every word that has neither pronunciation yet.
     *
     * <p>Words whose page cannot be fetched are left untouched and are not followed by a delay.
     *
     * @param list words to update in place
     * @throws InterruptedException if the thread is interrupted during the delay between requests
     */
    private static void getWord(List<Word> list) throws InterruptedException {
        for (Word word : list) {
            if (word.getPronUk() != null || word.getPronUs() != null) {
                continue;
            }
            if (word.getWord() == null) {
                // Nothing to look up; skip instead of aborting the whole run with a NullPointerException.
                System.out.println("skipped: entry has no word text");
                continue;
            }

            Document doc = fetchPage(word);
            if (doc == null) {
                continue;
            }

            fillPronunciation(word, doc);
            fillMeanings(word, doc);
            pause(200);
        }
    }

    /** Requests the page for {@code word}; prints the failure and returns {@code null} on any error. */
    private static Document fetchPage(Word word) {
        String url = PAGE_URL_BASE + "/" + word.getWord().trim();
        try {
            Connection con = Jsoup.connect(url);
            return con.userAgent(randomUserAgent())
                    .proxy(PAGE_PROXY_HOST, PROXY_PORT)
                    .header("referer", PAGE_REFERER)
                    .timeout(TIMEOUT_MILLIS)
                    .ignoreContentType(true)
                    .ignoreHttpErrors(true)
                    .get();
        } catch (Exception e) {
            System.out.println(e.getMessage() + " " + word.getWord());
            return null;
        }
    }

    /**
     * Copies the pronunciations from the {@code yd-word-pron} element onto {@code word}.
     *
     * <p>The element text is split at "美": the part before it, minus "英", becomes the UK
     * pronunciation and the part after it, when present, the US pronunciation. The level is set
     * whenever at least one part is found.
     */
    private static void fillPronunciation(Word word, Document doc) {
        Element pron = doc.getElementById("yd-word-pron");
        if (pron == null) {
            return;
        }
        String[] prons = pron.text().split("美");
        // split() drops trailing empty parts, so a text consisting only of the separator yields none.
        if (prons.length == 0) {
            return;
        }
        word.setLevel(LEVEL);
        word.setPronUk(prons[0].replace("英", "").trim());
        if (prons.length > 1) {
            word.setPronUs(prons[1].trim());
        }
    }

    /** Copies the text of the first two {@code li} items under {@code yd-word-meaning} onto {@code word}. */
    private static void fillMeanings(Word word, Document doc) {
        Element meaning = doc.getElementById("yd-word-meaning");
        if (meaning == null) {
            return;
        }
        Elements items = meaning.select("li");
        if (items.size() > 0) {
            word.setMeaning1(items.get(0).text());
        }
        if (items.size() > 1) {
            word.setMeaning2(items.get(1).text());
        }
    }

    /** Picks a user agent uniformly from the whole list, whatever its length. */
    private static String randomUserAgent() {
        return USER_AGENTS[RANDOM.nextInt(USER_AGENTS.length)];
    }

    /**
     * Sleeps for the base delay plus a random jitter, then prints the jitter.
     *
     * @param jitterBound exclusive upper bound of the jitter, in milliseconds
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    private static void pause(int jitterBound) throws InterruptedException {
        int ra = RANDOM.nextInt(jitterBound);
        Thread.sleep(BASE_DELAY_MILLIS + ra);
        System.out.println("ra=" + ra);
    }
}


// 你可能感兴趣的:(java) (related topic: java)