java 网络爬虫(以爬取豆瓣为例)

文章目录

  • 一、 设计思路
  • 二、 功能概述
  • 三、运行状态图
  • 四、代码设计
    • 1.设计一个类负责任意一个豆瓣电影详情网页的爬取。
    • 2.设计一个类爬取 [豆瓣电影 Top 250](https://movie.douban.com/top250?start=0)上的电影网站。
    • 3.设计一个UI界面,把爬取的信息显示出来。
    • 4.Selenium类爬取网页,可以实现全网搜索。

一、 设计思路

先获取豆瓣 TOP250 榜单中各电影的详情页网址,再逐个解析这些详情页,提取电影网址、短评、海报、上映日期、电影简介等信息,并将这些信息显示到UI界面上。此外,还支持输入某个电影的详情页网址并解析该页面;也可以直接根据电影名称搜索电影,把搜索到的信息显示到UI界面上。
整体代码链接


二、 功能概述

1. 自动获取指定数目的豆瓣榜单TOP250电影网站,并且自动对每个电影网站进行解析获得相应的电影名、海报、评论等信息。
2. 在UI界面搜索框中输入一个电影的网站即可解析该网站,并把解析的内容更新UI界面。
3. 下载搜索到的所有海报到工程根目录下的image文件夹,并自动分类。
4. 利用Selenium 插件自动控制Google Chrome 进行全网搜索,只需要输入不完整的电影名即可实现(要求必须有Google Chrome浏览器)。
豆瓣电影 Top 250


三、运行状态图

java 网络爬虫(以爬取豆瓣为例)_第1张图片

四、代码设计

1.设计一个类负责任意一个豆瓣电影详情网页的爬取。

主要获取的电影信息有:网址、中文名、英文名、评分等。

package Courses.GrabData;

import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


//电影详情页面可解析信息如下:电影id,电影中文名,电影别名,电影豆瓣评分,看过人数,想看人数,评分人数,5张海报图,条目分类,导演,主演,编剧,官方网站,豆瓣小站,上映时间日期,年代,语言,电影时长,影片类型。


/**
 * 	爬取一个电影详情页面。支持多线程
 * @author Administrator
 *
 */
public class DianYingXiangQing implements Runnable{

	public String wangzhan = "";	// URL of the movie detail page to crawl
	public String zhongwenname = "";// Chinese title
	public String yingwenname = "";// English title
	public String daoyanname = "";	// director name(s); multiple names are joined with " / "
	public String bianjuname = "";// screenwriter name(s); multiple names are joined with " / "
	public String zhuyanname = "";	// leading actor name(s); multiple names are joined with " / "
	public String shangyingriqi = "";	// release date
	public String pingjiarenshu = "";	// number of ratings
	public String pingjiafenshu = "";	// rating score
	public String leixing = "";	// genre(s)
	public String jianjie = "";	// synopsis
	public String haibaourl = "";	// URL of the poster listing page
	public String duanpingurl = "";	// URL of the "all short comments" page
	public int duanpingcount = 0;	// number of short comments
	public ArrayList<String> list_haibao = new ArrayList<>();	// poster image URLs
	public ArrayList<String> list_duanping = new ArrayList<>();	// short-comment texts

	// Whether posters are saved as local files. When saved, posters are read from disk
	// instead of being downloaded again, which saves time but uses more disk space.
	public boolean imagebaocun = false;
	/**
	 * Creates a crawler for one movie detail page. Crawling does not start here;
	 * the caller invokes {@code run()} directly or through a {@code Thread}.
	 *
	 * @param wz URL of the movie detail page
	 */
	public DianYingXiangQing(String wz){
		wangzhan = wz;
		//run();
	}
	@Override
	/**
	 * 程序运行开始爬取电影详情
	 */
	public void run(){
		String lie = "";
		Scanner input = null;
		Pattern r;
		Matcher m;
		try {
			URL url = new URL(wangzhan);
			input = new Scanner(url.openStream(),"UTF-8");
		}
		catch (java.net.MalformedURLException ex){
			System.out.println("Invalid URL"+ lie);
		}
		catch (java.io.IOException ex){
			System.out.println("I/O Errors: no such file" + lie);
		}
		while(input.hasNext()){
			lie = input.nextLine();
			r = Pattern.compile("");
			m = r.matcher(lie);
			if(m.find()) break;
			
			if(daoyan || bianju || zhuyan) {
				if(daoyan)
					r = Pattern.compile("director");
				else if(bianju)
					r = Pattern.compile("author");
				else
					r = Pattern.compile("actor");
				
				m = r.matcher(lie);
				if(m.find()) {
					int geshu = 0;
					while(input.hasNext()){
						lie = input.nextLine();
						r = Pattern.compile("]");
						m = r.matcher(lie);
						if(m.find())
							break;
						r = Pattern.compile("\"name\": \"([^\"]*)");
						m = r.matcher(lie);
	
						if(m.find()) {
							geshu++;
							if(daoyan){
								if(geshu>1)
									daoyanname += " / " + m.group(1);
								else
									daoyanname += m.group(1);
							}	
							else if(bianju){
								if(geshu>1)
									bianjuname += " / " + m.group(1);
								else
									bianjuname += m.group(1);	
							}
							else if(zhuyan){
								if(geshu>1)
									zhuyanname += " / " + m.group(1);
								else
									zhuyanname += m.group(1);	
							}
						}
					}
					if(daoyan)
						daoyan = false;
					else if(bianju)
						bianju = false;
					else if(zhuyan)
						zhuyan = false;
				}
			}
			else {
				r = Pattern.compile("\"datePublished\": \"([^\"]*)");
				m = r.matcher(lie);
				if(m.find())
					shangyingriqi = m.group(1);
				if(shangyingriqi.length()>0) {
					r = Pattern.compile("\"ratingCount\": \"([^\"]*)\"");
					m = r.matcher(lie);
					if(m.find())
						pingjiarenshu = m.group(1);
				}
				if(pingjiarenshu.length()>0) {
					r = Pattern.compile("\"ratingValue\": \"([^\"]*)\"");
					m = r.matcher(lie);
					if(m.find())
						pingjiafenshu = m.group(1);
				}
			}
		}
			//获取海报5张即可
	    while(input.hasNext()){
	    	lie = input.nextLine();
	    	r = Pattern.compile("
); m = r.matcher(lie.length()>16?lie.substring(0,16):lie); if(m.find()) { lie = input.nextLine(); r = Pattern.compile("href=\"([^\"]*)"); m = r.matcher(lie); if(m.find()) { haibaourl = m.group(1); gethaibao(haibaourl,10); if(imagebaocun) ImageDownload(); } break; } } while(input.hasNext()){ lie = input.nextLine(); r = Pattern.compile(" 类型:"); m = r.matcher(lie.length()>35?lie.substring(0,35):lie); if(m.find()){ int fromIndex = 0; while(true){ int count1 = lie.indexOf("", fromIndex); if(count1 == -1) break; int count2 = lie.indexOf("", count1); count1+=25; if(fromIndex != 0) leixing += "/" + lie.substring(count1, count2); else leixing += lie.substring(count1, count2); fromIndex = count2; } break; } } //简介 while(input.hasNext()){ lie = input.nextLine(); r = Pattern.compile("); m = r.matcher(lie.length()>55?lie.substring(0,55):lie); if(m.find()) { jianjie = input.nextLine(); input.nextLine(); jianjie += input.nextLine(); break; } } while(input.hasNext()){ lie = input.nextLine(); r = Pattern.compile("
"); m = r.matcher(lie.length()>31?lie.substring(0,31):lie); if(m.find()) { break; } } //获取100条评论,按热度排序的 while(input.hasNext()){ lie = input.nextLine(); r = Pattern.compile("); m = r.matcher(lie); if(m.find()) { duanpingurl = m.group(1); duanpingcount = 100; getduanping(duanpingurl,100); break; } } } /** * 获取短评,内部调用 * @param s * @param shumu */ void getduanping(String s,int shumu) { if(shumu<=0) return; String lie = ""; Scanner input = null; Pattern r; Matcher m; int count_zhang=0; try { URL url = new URL(s); input = new Scanner(url.openStream(),"UTF-8"); } catch (java.net.MalformedURLException ex){ System.out.println("Invalid URL"); } catch (java.io.IOException ex){ System.out.println("I/O Errors: no such file"); } while(input.hasNext()) { lie = input.nextLine(); if(lie.length() == 38 && lie.equals("
")) break; } while(input.hasNext()) { lie = input.nextLine(); r = Pattern.compile("([^<]*)"); m = r.matcher(lie); if(m.find()) { list_duanping.add(m.group(1)); count_zhang++; if(shumu == count_zhang) return; } else { if(lie.equals("
")) break; } } while(input.hasNext()) { lie = input.nextLine(); r = Pattern.compile("后页"); m = r.matcher(lie); if(m.find()) { r = Pattern.compile("href=\"([^\"]*)"); m = r.matcher(lie); if(m.find()){ String xiaye = s.substring(0, s.indexOf('?'))+m.group(1); int weizi; while((weizi = xiaye.indexOf("amp;")) != -1) { xiaye = xiaye.substring(0, weizi) + xiaye.substring(weizi+4,xiaye.length()); } getduanping(xiaye,shumu-count_zhang); } break; } } } /** * 获取海报地址,内部调用 * @param s * @param shumu */ void gethaibao(String s,int shumu) { if(shumu<=0) return; String lie = ""; Scanner input = null; Pattern r; Matcher m; int count_zhang=0; try { URL url = new URL(s); input = new Scanner(url.openStream()); } catch (java.net.MalformedURLException ex){ System.out.println("Invalid URL"); } catch (java.io.IOException ex){ System.out.println("I/O Errors: no such file"); } while(input.hasNext()) { lie = input.nextLine(); if(lie.length() == 22&&lie.equals("
")) break; } while(input.hasNext()) { lie = input.nextLine(); r = Pattern.compile("src=\"([^\"]*)\" />"); m = r.matcher(lie); if(m.find()) { String haibaos = m.group(1).substring(0,m.group(1).length()-4)+".jpg"; list_haibao.add(haibaos); count_zhang++; if(count_zhang == shumu) return; } else { if(lie.equals("")) break; } } while(input.hasNext()) { lie = input.nextLine(); r = Pattern.compile("后页"); m = r.matcher(lie); if(m.find()) { r = Pattern.compile("href=\"([^\"]*)"); m = r.matcher(lie); if(m.find()){ gethaibao(m.group(1),shumu-count_zhang); } break; } } } public void printall(){ System.out.println(getshuju()); } /** * 下载海报图片。内部调用 * @param s * @param t */ void ImageDownload() { try { String filePar = "image/"+zhongwenname +"/";// 文件夹路径 File myPath = new File( filePar ); if (!myPath.exists()){//若此目录不存在,则创建之 myPath.mkdir(); } else return; //如果存在退出,无需重复下载 for(int i=1;i<=list_haibao.size();i++) { URL url = new URL(list_haibao.get(i-1)); DataInputStream dataInputStream = new DataInputStream(url.openStream()); FileOutputStream fileOutputStream = new FileOutputStream(new File(filePar+ zhongwenname + i +".jpg")); ByteArrayOutputStream output = new ByteArrayOutputStream(); byte[] buffer = new byte[1024]; int length; while ((length = dataInputStream.read(buffer)) > 0) { output.write(buffer, 0, length); } fileOutputStream.write(output.toByteArray()); dataInputStream.close(); fileOutputStream.close(); } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public String getshuju() { String s = "\t"; return wangzhan + s + zhongwenname + s + yingwenname + s + daoyanname + s + bianjuname + s + zhuyanname + s + shangyingriqi + s + pingjiarenshu + s + pingjiafenshu + s + leixing + s + jianjie + s + haibaourl + s + duanpingurl + s + duanpingcount; } }

2.设计一个类爬取 豆瓣电影 Top 250上的电影网站。

但不爬取其详细信息,然后把这个网站传给上面的类爬取每个电影的详情。

package Courses.GrabData;

import java.io.File;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 获取豆瓣top250页面的电影及其详情
 * @author Administrator
 *
 */
public class DouBan{
	/** Entries read from the Top 250 ranking pages. */
	private ArrayList<BangDan> list_bangdan = new ArrayList<>();
	/** One detail crawler per ranking entry, in ranking order. */
	public ArrayList<DianYingXiangQing> list_xiangqing = new ArrayList<>();
	/** How many movies this instance was asked to crawl (always within 1..250). */
	public int number = 0;

	/**
	 * Crawls the full list of 250 movies.
	 */
	public DouBan(){
		this(250);
	}

	/**
	 * Crawls the first {@code n} movies of the ranking together with their details.
	 * Values outside 1..250 are clamped into that range.
	 *
	 * @param n requested number of movies
	 */
	public DouBan(int n){
		number = Math.max(1, Math.min(250, n));
		getbangdan();
		getxiangqing();
	}
	/**
	 * Starts one daemon thread per ranking entry to crawl its detail page and
	 * returns once every thread has finished. Each crawler is appended to
	 * {@code list_xiangqing} in ranking order.
	 */
	public void getxiangqing() {
		Thread[] workers = new Thread[list_bangdan.size()];
		int next = 0;
		for (BangDan bd : list_bangdan) {
			DianYingXiangQing xq = new DianYingXiangQing(bd.wangzhan);
			Thread worker = new Thread(xq);
			worker.setDaemon(true);
			worker.setName(bd.zhongwenname);
			worker.start();
			workers[next++] = worker;

			list_xiangqing.add(xq);
		}

		// Block on join() instead of spinning on isAlive(), which kept a CPU core
		// fully busy for the whole duration of the crawl.
		boolean interrupted = false;
		for (Thread worker : workers) {
			while (true) {
				try {
					worker.join();
					break;
				} catch (InterruptedException ex) {
					// Keep waiting so the method still returns only after all
					// crawlers are done; the interrupt is re-asserted below.
					interrupted = true;
				}
			}
		}
		if (interrupted) {
			Thread.currentThread().interrupt();
		}
	}
	/**
	 * Writes every crawled movie to a text file in the working directory.
	 * Each movie produces three lines: its index and tab-separated fields,
	 * its poster URLs, and its short comments.
	 * An existing file is never overwritten; a numeric suffix is appended to the
	 * file name until an unused name is found. Failures are reported on stdout.
	 */
	public void  printwxiangqing() {
		File file = new File("电影TOP250详情.txt");
		for (int suffix = 1; file.exists(); suffix++) {
			file = new File("电影TOP250详情" + suffix + ".txt");
		}
		// try-with-resources closes the writer even if writing a record throws.
		try (PrintWriter output = new PrintWriter(file)) {
			int index = 1;
			for (DianYingXiangQing xq : list_xiangqing) {
				output.println(index + "\t" + xq.getshuju());
				for (String poster : xq.list_haibao) {
					output.print(poster + "\t");
				}
				output.println();
				for (String comment : xq.list_duanping) {
					output.print(comment + "\t");
				}
				output.println();
				index++;
			}
		}
		catch (Exception ex) {
			System.out.println("Error: " + ex.getMessage());
		}
	}
	/**
	 * Writes the ranking entries, one tab-separated line per movie, to a text
	 * file in the working directory.
	 * An existing file is never overwritten: as in {@code printwxiangqing}, a
	 * numeric suffix is appended until an unused name is found. This replaces
	 * terminating the whole JVM when the default file already exists.
	 * Failures are reported on stdout.
	 */
	public void  printwbangdan() {
		File file = new File("电影TOP250.txt");
		for (int suffix = 1; file.exists(); suffix++) {
			file = new File("电影TOP250" + suffix + ".txt");
		}
		// try-with-resources closes the writer even if writing a record throws.
		try (PrintWriter output = new PrintWriter(file)) {
			for (BangDan b : list_bangdan) {
				output.println(b.getshuju());
			}
		}
		catch (Exception ex) {
			System.out.println("Error: " + ex.getMessage());
		}
	}
	public void getbangdan() {

		String URLString = "https://movie.douban.com/top250?start=" + (list_bangdan.size()/25*25);
		try {
			URL url = new URL(URLString);
			Scanner input = new Scanner(url.openStream(),"UTF-8");
			String lie = "";
			Pattern r;
			Matcher m;

			do {
				lie = input.nextLine();
				r = Pattern.compile("
"); m = r.matcher(lie); }while(input.hasNext() && !m.find()); for(int i=0;i<25;i++) { BangDan dianying = new BangDan(); do { lie = input.nextLine(); r = Pattern.compile("
"); m = r.matcher(lie); }while(input.hasNext() && !m.find()); lie = input.nextLine(); r = Pattern.compile("(\\D*)(\\d*)"); m = r.matcher(lie); if(m.find()) { dianying.paiming = Integer.parseInt(m.group(2)); } lie = input.nextLine(); r = Pattern.compile("]*)"); m = r.matcher(lie); if(m.find()) { dianying.wangzhan = (m.group(1)); } r = Pattern.compile("(\\D*)(\\d*)"); m = r.matcher(dianying.wangzhan); if(m.find()) { dianying.id = (m.group(2)); } lie = input.nextLine(); r = Pattern.compile("([[^[\u4e00-\u9fa5]]]+)([^\"]+)\" src=\"([^\">]*)"); m = r.matcher(lie); if(m.find()) { dianying.zhongwenname = (m.group(2)); dianying.fengmian = m.group(3); } do { lie = input.nextLine(); r = Pattern.compile(""); m = r.matcher(lie); }while(input.hasNext() && !m.find()); lie = input.nextLine(); r = Pattern.compile("/ ([^<]+)"); m = r.matcher(lie); if(m.find()) { dianying.yingwenname = (m.group(1)); } //导演,主演 do { lie = input.nextLine(); r = Pattern.compile("导演"); m = r.matcher(lie); }while(input.hasNext() && !m.find()); r = Pattern.compile(" 导演: (.*)   主演: (.*)...
"
); m = r.matcher(lie); if(m.find()) { dianying.daoyan = (m.group(1)); dianying.zhuyan = (m.group(2)); } else { r = Pattern.compile(" 导演: (.*)  &"); m = r.matcher(lie); if(m.find()) { dianying.daoyan = (m.group(1)); } } //获取摘要 Matcher m2; do { lie = input.nextLine(); r = Pattern.compile(""); m = r.matcher(lie); Pattern r2 = Pattern.compile(" ");; m2 = r2.matcher(lie);; }while(input.hasNext() && !m.find() && !m2.find()); r = Pattern.compile("([^<]+)"); m = r.matcher(lie); if(m.find()) { dianying.zhaiyao = (m.group(1)); } //dianying.printall(); list_bangdan.add(dianying); if(list_bangdan.size() == number) return; } getbangdan(); } catch (java.net.MalformedURLException ex){ System.out.println("Invalid URL"); } catch (java.io.IOException ex){ System.out.println("I/O Errors: no such file"); } catch (java.lang.NumberFormatException ex){ System.out.println("null"); } } } class BangDan { //排行榜排名,网站、电影中文名, 电影英文名,电影id,电影导演,电影主演,电影封面、摘要 public int paiming; public String wangzhan = ""; public String zhongwenname = ""; public String yingwenname = ""; public String id = ""; public String daoyan = ""; public String zhuyan = ""; public String fengmian = ""; public String zhaiyao = ""; String getshuju() { String s = "\t"; return paiming + s + wangzhan + s + zhongwenname + s + yingwenname + s + id + s + daoyan + s + zhuyan + s + fengmian + s + zhaiyao; } void printall(){ System.out.println(paiming + "," + wangzhan + "," + zhongwenname + "," + yingwenname + "," + id + "," + daoyan + "," + zhuyan + "," + fengmian + "," + zhaiyao); } }

3.设计一个UI界面,把爬取的信息显示出来。

UI界面显示爬取的信息,除此之外也是程序开始的地方。

package Courses.UI;

import Courses.GrabData.DianYingXiangQing;
import Courses.GrabData.DouBan;
import Selenium.HelloSelenium;
import javafx.application.Application;
import javafx.geometry.Orientation;
import javafx.scene.Scene;
import javafx.scene.control.Button;
import javafx.scene.control.Label;
import javafx.scene.control.TextField;
import javafx.scene.image.Image;
import javafx.scene.image.ImageView;
import javafx.scene.layout.FlowPane;
import javafx.scene.layout.Pane;
import javafx.scene.text.*;
import javafx.stage.Stage;
import javafx.scene.paint.Color;
import javafx.scene.control.TextArea;
/**
 * UI界面,也是程序开始的地方
 * @author Administrator
 * 2019年12月8日
 */
public class WebCrawlerUI extends Application {

	static double imageHeight = 550;	// poster height
	static double imagelayouH = 450;	// vertical offset of the poster from the top of the pane
	
	int count_num = 20;			// how many movies to load at start-up
	static String dianyingwangzhan = "";	// text entered in the search box (movie URL or search text)
	DianYingXiangQing dianying;	// the movie currently displayed
	int num = 0;				// index of the displayed movie in douban.list_xiangqing
	int haibao_count = 1;		// 1-based index of the displayed poster
	int duanping_count = 0;		// index of the first comment on the current comment page
	int xianshipinglun = 5;		// number of comments shown per page
	DouBan douban;	// crawler holding the loaded movies
	ImageView image;	// poster view
	TextArea  textjianjie;	// synopsis
	Text   	textname;	// movie title
	Text   	texturl;	// movie URL
	Text	textriqi;	// release date
	Text	textlx;	// genre
	Text	textfs;	// rating score
	Text	text_pinglunyema;	// comment page indicator
	Text	text_haibaoyema;	// poster page indicator
	Text	text_dianyingyema;	// movie page indicator
	
	// one text area per comment slot on a page
	TextArea []pinglun = new TextArea[5];
	/**
	 * Program entry point; starts the JavaFX application.
	 *
	 * @param args command-line arguments, forwarded to JavaFX
	 */
	public static void main(String[] args) {
		// Forward the arguments so they are not silently dropped.
		launch(args);
	}
	/**
	 * Loads the first {@code count_num} Top 250 movies and builds the main window:
	 * search box, movie paging buttons, synopsis, movie facts, short comments with
	 * paging, and the poster with paging.
	 *
	 * @param primaryStage stage supplied by JavaFX
	 */
	@Override
	public void start (Stage primaryStage) {
		Pane pane = new Pane();

		System.out.println("正在加载豆瓣电影TOP250!!!");
		System.out.println("共加载" + count_num + "个电影详情");

		// NOTE(review): the DouBan constructor crawls synchronously, so the window
		// only appears after all detail pages have been fetched.
		douban = new DouBan(count_num);
		if (douban.list_xiangqing.isEmpty()) {
			// Nothing could be loaded (e.g. no network): show an empty record
			// instead of failing with IndexOutOfBoundsException.
			dianying = new DianYingXiangQing("");
		} else {
			dianying = douban.list_xiangqing.get(num);
		}

		// Poster view
		image = new ImageView();
		image.setFitHeight(imageHeight);
		image.setPreserveRatio(true);
		image.setSmooth(true);
		image.setCache(true);
		image.setLayoutY(imagelayouH);
		pane.getChildren().add(image);

		// Top-left search box
		FlowPane flow = new FlowPane();
		Text text1 = new Text("\t  请在下框中输入豆瓣电影网址:");
		text1.setScaleX(1.5);
		text1.setScaleY(1.5);
		flow.getChildren().add(new Text());
		flow.getChildren().add(text1);

		TextField dyurl = new TextField("例:https://movie.douban.com/subject/26794435/");
		dyurl.setMinWidth(500);
		flow.getChildren().add(dyurl);

		// Top-left hint for the Top 250 paging buttons
		flow.getChildren().add(new Label(""));
		Text text2 = new Text("\t   点击下面按钮显示豆瓣评分TOP250:");
		text2.setScaleX(1.5);
		text2.setScaleY(1.5);
		text2.setFill(Color.BLUE);
		flow.getChildren().add(text2);

		flow.setVgap(10);
		flow.setHgap(4);
		flow.setOrientation(Orientation.VERTICAL);
		pane.getChildren().add(flow);

		// Confirm button: parse or search for what was typed in the search box
		pane.getChildren().add(makeButton("确认", 520, 60, () -> {
			dianyingwangzhan = dyurl.getText();
			UIUpdate(0);
		}));

		// Previous / next movie
		pane.getChildren().add(makeButton("上一页", 50, 180, () -> UIUpdate(-1)));
		pane.getChildren().add(makeButton("下一页", 300, 180, () -> UIUpdate(1)));

		// Movie page indicator
		text_dianyingyema = makeText("第" + (num + 1) + "页/共" + douban.list_xiangqing.size() + "页", 160, 200);
		pane.getChildren().add(text_dianyingyema);

		// Synopsis
		Text text3 = makeText("    简介:", 0, 250);
		text3.setFill(Color.RED);
		pane.getChildren().add(text3);

		textjianjie = new TextArea (dianying.jianjie);
		textjianjie.setLayoutY(270);
		textjianjie.setEditable(false);
		textjianjie.setWrapText(true);
		pane.getChildren().add(textjianjie);

		// Movie title
		Text text4 = makeText("    影名:", 800, 50);
		text4.setFill(Color.RED);
		pane.getChildren().add(text4);

		textname = makeText(dianying.zhongwenname, 900, 50);
		textname.setFill(Color.RED);
		pane.getChildren().add(textname);

		// Movie URL
		pane.getChildren().add(makeText("豆瓣网址:", 1200, 50));
		texturl = makeText(dianying.wangzhan, 1400, 50);
		pane.getChildren().add(texturl);

		// Genre
		pane.getChildren().add(makeText("类型:", 820, 100));
		textlx = makeText(dianying.leixing, 900, 100);
		pane.getChildren().add(textlx);

		// Release date
		pane.getChildren().add(makeText("上映日期:", 1200, 100));
		textriqi = makeText(dianying.shangyingriqi, 1350, 100);
		pane.getChildren().add(textriqi);

		// Rating score
		pane.getChildren().add(makeText("评分:", 820, 150));
		textfs = makeText(dianying.pingjiafenshu, 900, 150);
		pane.getChildren().add(textfs);

		// Short comments
		pane.getChildren().add(makeText("短评:", 820, 200));

		text_pinglunyema = makeText("第" +(duanping_count/xianshipinglun + 1)+ "页/" + "共" + dianying.duanpingcount/xianshipinglun + "页", 1050, 820);
		pane.getChildren().add(text_pinglunyema);

		for (int i = 0; i < pinglun.length; i++) {
			int jianju = 100;
			int index = i + duanping_count;
			// A movie may have fewer comments than one page holds; leave the
			// remaining slots blank instead of throwing IndexOutOfBoundsException.
			String content = index < dianying.list_duanping.size()
					? dianying.list_duanping.get(index) : "";
			pinglun[i] = new TextArea(content);
			pinglun[i].setLayoutY(250 + i*jianju );
			pinglun[i].setLayoutX(820);
			pinglun[i].setMaxHeight(100);
			pinglun[i].setWrapText(true);
			pinglun[i].setEditable(false);
			pane.getChildren().add(pinglun[i]);
		}

		// Next / previous comment page
		pane.getChildren().add(makeButton("下一页短评", 1200, 800, () -> {
			if(duanping_count>=dianying.duanpingcount - xianshipinglun)
				return;
			duanping_count += xianshipinglun;
			setDuanping();
		}));
		pane.getChildren().add(makeButton("上一页短评", 900, 800, () -> {
			if(duanping_count<xianshipinglun)
				return;
			duanping_count -= xianshipinglun;
			setDuanping();
		}));

		// Next / previous poster
		double x = 450;
		pane.getChildren().add(makeButton("下一个海报", x, 620, () -> {
			if(haibao_count>=dianying.list_haibao.size())
				return;
			haibao_count++;
			setImage(haibao_count);
		}));
		pane.getChildren().add(makeButton("上一个海报", x, 540, () -> {
			if(haibao_count<=1)
				return;
			haibao_count--;
			setImage(haibao_count);
		}));

		// Poster page indicator
		text_haibaoyema = makeText("第" +haibao_count+ "页/" + "共" + dianying.list_haibao.size() + "页", x, 600);
		pane.getChildren().add(text_haibaoyema);

		// Only show the first poster when there is one to show.
		if (dianying.imagebaocun || !dianying.list_haibao.isEmpty()) {
			setImage(1);
		}

		Scene scene = new Scene(pane);
		primaryStage.setMaximized(true);
		primaryStage.setTitle("网页爬虫");
		primaryStage.setScene(scene);
		primaryStage.show();

	}

	/** Creates a text node at the given position, scaled 1.5x like every label in this window. */
	private static Text makeText(String content, double x, double y) {
		Text text = new Text(content);
		text.setLayoutX(x);
		text.setLayoutY(y);
		text.setScaleX(1.5);
		text.setScaleY(1.5);
		return text;
	}

	/** Creates a button at the given position that runs {@code action} on mouse click. */
	private static Button makeButton(String caption, double x, double y, Runnable action) {
		Button button = new Button(caption);
		button.setLayoutX(x);
		button.setLayoutY(y);
		button.setOnMouseClicked(e -> action.run());
		return button;
	}
	/**
	 * Switches the displayed movie and refreshes the window.
	 *
	 * @param annv {@code 0}: use the search-box text in {@code dianyingwangzhan}
	 *             (a Douban detail URL, or free text that is searched through
	 *             Selenium); {@code -1}: previous movie; any other value: next
	 *             movie. Paging wraps around at both ends.
	 */
	void UIUpdate(int annv) {
		if(annv==0) {
			final String prefix = "https://movie.douban.com/subject/";
			String url = dianyingwangzhan == null ? "" : dianyingwangzhan.trim();

			// Extract the numeric movie id that follows the prefix, if present.
			int prefixAt = url.indexOf(prefix);
			int idStart = prefixAt < 0 ? -1 : prefixAt + prefix.length();
			int idEnd = idStart;
			while (idStart >= 0 && idEnd < url.length()
					&& url.charAt(idEnd) >= '0' && url.charAt(idEnd) <= '9') {
				idEnd++;
			}

			if (idStart >= 0 && idEnd > idStart) {
				// Normalise to ".../subject/<id>/" so that a URL without the
				// trailing slash is parsed directly instead of being searched.
				url = prefix + url.substring(idStart, idEnd) + "/";
			}
			else {
				url = HelloSelenium.getSearchUrl(url);
				System.out.println("正在搜索电影网站电影网站" + url);
				if (url == null || url.isEmpty()) {
					// Nothing found: keep showing the current movie.
					return;
				}
			}
			dianyingwangzhan = url;

			int found = -1;
			for(int i=0;i<douban.list_xiangqing.size();i++){
				if(url.equals(douban.list_xiangqing.get(i).wangzhan)) {
					found = i;
					break;
				}
			}

			if (found >= 0) {
				num = found;
				dianying = douban.list_xiangqing.get(found);
			}
			else {
				// Crawl into a local first so that a failing crawl leaves the
				// currently displayed movie and the list untouched.
				DianYingXiangQing crawled = new DianYingXiangQing(url);
				crawled.run();
				num = douban.list_xiangqing.size();
				douban.list_xiangqing.add(crawled);
				dianying = crawled;
			}
		}
		else {
			int total = douban.list_xiangqing.size();
			if (total == 0) {
				// Nothing to page through.
				return;
			}
			if(annv==-1)
				--num;
			else
				++num;

			if(num >= total)
				num = 0;
			if(num<0)
				num = total-1;

			dianying = douban.list_xiangqing.get(num);
		}
		haibao_count = 1;
		setImage(haibao_count);
		SetSummary();
	}
	/**
	 * Copies the current movie's facts into the text widgets, resets comment
	 * paging to the first page and refreshes the movie page indicator and the
	 * comment area.
	 */
	void SetSummary(){
		DianYingXiangQing movie = dianying;

		textjianjie.setText(movie.jianjie);
		textname.setText(movie.zhongwenname);
		texturl.setText(movie.wangzhan);
		textlx.setText(movie.leixing);
		textriqi.setText(movie.shangyingriqi);
		textfs.setText(movie.pingjiafenshu);

		int totalMovies = douban.list_xiangqing.size();
		text_dianyingyema.setText("第" + (num + 1) + "页/共" + totalMovies + "页");

		// Comments always restart from the first page for a newly shown movie.
		duanping_count = 0;
		setDuanping();
	}
	/**
	 * Refreshes the comment page indicator and fills the comment areas with the
	 * page starting at {@code duanping_count}. Slots for which the movie has no
	 * comment are cleared.
	 */
	void setDuanping() {
		text_pinglunyema.setText("第" +(duanping_count/xianshipinglun + 1)+ "页/" + "共" + dianying.duanpingcount/xianshipinglun + "页");
		int available = dianying.list_duanping.size();
		int slots = Math.min(xianshipinglun, pinglun.length);
		for(int i=0;i<slots;i++) {
			int index = duanping_count + i;
			// Fewer comments than a full page used to throw IndexOutOfBoundsException.
			pinglun[i].setText(index >= 0 && index < available ? dianying.list_duanping.get(index) : "");
		}
	}
	/**
	 * Refreshes the poster page indicator and shows the {@code n}-th poster of the
	 * current movie. Posters are read from the local {@code image/} folder when
	 * {@code imagebaocun} is set, otherwise loaded from their URL. When the movie
	 * has no poster at position {@code n}, the poster view is cleared.
	 *
	 * @param n 1-based poster index
	 */
	void setImage(int n){

		text_haibaoyema.setText("第" + haibao_count+ "页/" + "共" + dianying.list_haibao.size() + "页");
		if(dianying.imagebaocun) {
			String name = dianying.zhongwenname;
			image.setImage(new Image("file:image/" + name+"/" +  name+ n+ ".jpg"));
			return;
		}
		if(n < 1 || n > dianying.list_haibao.size()) {
			// No such poster (e.g. the crawl found none): show nothing instead of
			// throwing IndexOutOfBoundsException.
			image.setImage(null);
			return;
		}
		image.setImage(new Image(dianying.list_haibao.get(n-1)));
	}
}

4.Selenium类爬取网页,可以实现全网搜索。

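需要配置 Selenium 的 ChromeDriver 驱动,并安装 Chrome 浏览器。
如果不需要本功能,可把UI类中与 Selenium 有关的两行代码删除:一行是 `import Selenium.HelloSelenium;`,另一行是 `UIUpdate` 方法中调用 `HelloSelenium.getSearchUrl(...)` 的语句。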
程序就可以保持其他功能不受影响。

package Selenium;

import java.util.List;

import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;



/**
 * Drives a real Chrome browser to run a Douban movie search and extract the
 * first result link. A plain HTTP request is not used because, according to
 * the original author, Douban hides the search results from it.
 * @author Administrator
 *
 */
public class HelloSelenium {

    public static void main(String[] args) {
        getSearchUrl("小丑");
    }

    /**
     * Searches Douban movies for {@code name} and returns the link of the first
     * result that points to a movie subject page.
     *
     * @param name movie name or part of it
     * @return the first result link containing "subject", or an empty string when
     *         {@code name} is blank or no such link was found
     */
    public static String getSearchUrl(String name)
    {
        if (name == null || name.trim().isEmpty()) {
            return "";
        }
        System.setProperty("webdriver.chrome.driver","chromedriver.exe");

        WebDriver driver = new ChromeDriver();
        String url = "";
        try {
            // Encode the search text so spaces, '&', '#', etc. cannot break the query string.
            driver.navigate().to("https://search.douban.com/movie/subject_search?search_text="
                    + encode(name.trim()) + "&cat=1002");

            // NOTE(review): fixed 20 s pause, presumably to let the result page render.
            try {
                Thread.sleep(20000);
            } catch (InterruptedException e) {
                // Restore the flag for the caller and carry on with what has loaded so far.
                Thread.currentThread().interrupt();
            }

            List<WebElement> results = driver.findElements(By.className("title-text"));
            for (WebElement result : results) {
                String href = result.getAttribute("href");
                // href may be absent; only accept links to a subject page so an
                // unrelated link is never returned.
                if (href != null && href.contains("subject")) {
                    url = href;
                    break;
                }
            }
            System.out.println("url:" + url);
        } finally {
            // Always close the browser, even when navigation or lookup fails.
            driver.quit();
        }
        return url;
    }

    /** URL-encodes {@code text} as UTF-8 for use in a query string. */
    private static String encode(String text) {
        try {
            return java.net.URLEncoder.encode(text, "UTF-8");
        } catch (java.io.UnsupportedEncodingException e) {
            throw new IllegalStateException("UTF-8 encoding is not available", e);
        }
    }
}

你可能感兴趣的:(JAVA基础)