Java简单爬虫

废话不多说,直接上代码

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.Connection.Method;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class ThreeBody {
//工程用于爬取落霞小说网中《三体》1-3部
//小说网为静态页面,使用最基本的技术即可完成
	/**
	 * Entry point: crawls parts 1-3 of "The Three-Body Problem" from luoxia.com
	 * and saves each chapter as a .txt file.
	 *
	 * The chapter-index pages follow a simple pattern:
	 *   https://www.luoxia.com/santi/santi-1/
	 *   https://www.luoxia.com/santi/santi-2/
	 *   https://www.luoxia.com/santi/santi-3/
	 *
	 * @param args unused
	 * @throws Exception if fetching or parsing an index page fails
	 */
	public static void main(String[] args) throws Exception {
		for (int part = 1; part <= 3; part++) {
			String mainUrl = "https://www.luoxia.com/santi/santi-" + part + "/";
			String directory = "E:/The Three-Body Problem-" + part;
			// Make sure the output directory exists before writing any chapter.
			new File(directory).mkdirs();
			Document doc = getCon(mainUrl).method(Method.GET).execute().parse();
			// The last <ul> on the index page holds the chapter list.
			Element ul = doc.select("ul").last();
			if (ul == null) {
				System.out.println(mainUrl + ": chapter list not found");
				continue;
			}
			ArrayList<String> urls = getUrl(ul);
			if (urls == null) {
				// getUrl logs the failure itself; skip this part.
				continue;
			}
			for (String url : urls) {
				Connection con = getCon(url);
				String msg = write(con, url, directory);
				System.out.println(msg);
			}
		}
	}

	/**
	 * 
	 * @parameter:Connection con,String url
	 * @return:String
	 * @description:将指定连接中文本写入txt文件
	 */
	//获取连接对象并创建以小说章节为标题的txt文件
	/**
	 * Downloads one chapter page and writes its paragraphs to a UTF-8 .txt file
	 * named after the chapter title.
	 *
	 * @param con       jsoup connection for the chapter page (not yet executed)
	 * @param url       chapter URL (unused; kept for interface compatibility)
	 * @param directory target directory for the .txt file
	 * @return status message: "&lt;title&gt;:写入成功" on success, "&lt;title&gt;:写入失败" on failure
	 */
	public static String write(Connection con, String url, String directory) {
		String title = null;
		try {
			Document doc = con.method(Method.GET).execute().parse();
			title = doc.select("h1#nr_title").text();
			// Characters \ / : * ? " < > | are illegal in Windows file names;
			// replace them so the chapter title is always a valid file name.
			String fileName = title.replaceAll("[\\\\/:*?\"<>|]", "_");
			File file = new File(directory + File.separator + fileName + ".txt");
			File parent = file.getParentFile();
			if (parent != null) {
				parent.mkdirs(); // ensure the output directory exists
			}
			// Write with explicit UTF-8: FileWriter would use the platform
			// default charset and could corrupt the Chinese text.
			// try-with-resources replaces the manual close/finally boilerplate.
			try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream(file), StandardCharsets.UTF_8))) {
				// div#nr1 holds the chapter body; one <p> per paragraph.
				Elements paragraphs = doc.select("div#nr1").get(0).select("p");
				for (Element p : paragraphs) {
					bw.write(p.text());
					bw.write("\r\n");
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
			return title + ":写入失败";
		}
		return title + ":写入成功";
	}

	/**
	 * 
	 * @parameter:Element element
	 * @return:ArrayList
	 * @description:在指定标签内抽取url
	 */
	 //在获取url时,目标网页的源码
  • 标签除了在索引为10,21,32,43,54位置包含外,其他位置均包含标签,为了懒省劲,直接用索引值代替 //用文章章节做文件名时,title中如果包含"\/:*<>?|"显示非法 public static ArrayList<String> getUrl(Element element) { ArrayList<String> urls = new ArrayList<>(); try { Elements lis = element.select("li"); for (int i = 0; i < lis.size(); i++) { if (i == 10 || i == 21 || i == 32 || i == 43 || i == 54) { Elements b = lis.get(i).select("b"); String value = b.attr("onclick"); String url = value.substring(value.indexOf("https"), value.indexOf("')")); urls.add(url); continue; } String url = lis.get(i).select("a").attr("href"); urls.add(url); } } catch (Exception e) { e.printStackTrace(); return null; } return urls; } /** * * @parameter:String url * @return:Connection con * @description:获取连接 */ public static Connection getCon(String mainUrl) { Connection con = Jsoup.connect(mainUrl).timeout(10000); con.header("User-0 Agent", "Mozilla/5.(Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"); return con; } }
  • 你可能感兴趣的:(java技术)