用Java抓取CSDN主页上的图片

一,步骤一:获取网页源码

1,定义要爬取的页面的URL对象

//定义即将访问的链接
String url="http://www.csdn.net";
//获取CSDN的URL对象
URL realURL = new URL(url);

2,获得这个链接的一个连接对象

URLConnection connection = realURL.openConnection();

3,开始连接

connection.connect();

4,用BufferedReader包装连接的输入流,逐行读取网页源码并拼接到一个String对象中

in = new BufferedReader(new InputStreamReader(connection.getInputStream()));		
String line="";
while((line=in.readLine())!=null){
	result+=line+"\n";
}
二,步骤二:正则匹配,存储图片

1,编译用于匹配img标签中src属性的正则表达式(Pattern)

Pattern pattern = Pattern.compile("<img \\S+\\ssrc=\"(.+?)\"");

2,用该Pattern对网页源码创建匹配器(Matcher)对象

Matcher matcher = pattern.matcher(result);

3,实例化文件输出流

imgFile= new FileOutputStream("D:\\CSDN"+i+".png");

4,获得图片链接的缓冲流
URL imgURL = new URL(img);
URLConnection imgConnection = imgURL.openConnection();
imgConnection.connect();
bufferedImage = (BufferedInputStream) new BufferedInputStream(imgConnection.getInputStream());

5,将缓冲流写入文件中

while((size=bufferedImage.read(buf))!=-1){
	imgFile.write(buf, 0, size);
}
					

6,将输入输出流关闭

in.close();
imgFile.close();
bufferedImage.close();


程序的完整源码:

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal image crawler: downloads the HTML of {@link #PAGE_URL}, extracts the
 * {@code src} value of every {@code <img>} tag matched by {@link #IMG_PATTERN},
 * and saves each referenced resource to {@code D:\CSDN<index>.png}.
 */
public class CrawOne {

	/** Page whose HTML source is scanned for image tags. */
	private static final String PAGE_URL = "http://www.csdn.net";

	/** Output files are named {@code OUTPUT_PREFIX + index + ".png"}. */
	private static final String OUTPUT_PREFIX = "D:\\CSDN";

	/** Maximum number of bytes copied per read when saving an image. */
	private static final int BUFFER_SIZE = 1024;

	/**
	 * Matches {@code <img}, one whitespace-free token, a single whitespace
	 * character, then {@code src="..."}; group 1 is the quoted src value.
	 * Compiled once because the pattern never changes.
	 */
	private static final Pattern IMG_PATTERN = Pattern.compile("<img \\S+\\ssrc=\"(.+?)\"");

	/**
	 * Fetches the page, finds the image links and stores every image on disk.
	 *
	 * @param args ignored
	 * @throws IOException if the page or an image cannot be read, or a file cannot be written
	 */
	public static void main(String[] args) throws IOException {
		try {
			URL pageUrl = new URL(PAGE_URL);
			String html = fetchPage(pageUrl);

			Matcher matcher = IMG_PATTERN.matcher(html);
			int index = 0;
			while (matcher.find()) {
				System.out.println("find one");
				// Only one capturing group is defined, so group(1) is the src value
				// (group(0) would be the whole match).
				String src = matcher.group(1);
				String targetPath = OUTPUT_PREFIX + index + ".png";
				index++;

				URL imageUrl;
				try {
					// Resolve against the page URL so relative and protocol-relative
					// src values work; absolute URLs are returned unchanged.
					imageUrl = new URL(pageUrl, src);
				} catch (MalformedURLException e) {
					// One unusable link (e.g. an unsupported scheme) must not abort
					// the remaining downloads.
					System.err.println("Skipping malformed image URL: " + src);
					continue;
				}
				downloadImage(imageUrl, targetPath);
			}
		} catch (MalformedURLException e) {
			System.err.println("Malformed page URL: " + e.getMessage());
		}
		System.out.println("Get some picture.");
	}

	/**
	 * Downloads the text behind {@code pageUrl}, joining lines with {@code '\n'}.
	 *
	 * @param pageUrl the page to download
	 * @return the page source, each line terminated by a newline
	 * @throws IOException if the connection fails or the stream cannot be read
	 */
	private static String fetchPage(URL pageUrl) throws IOException {
		URLConnection connection = pageUrl.openConnection();
		connection.connect();

		// StringBuilder avoids re-copying the whole page on every appended line.
		StringBuilder html = new StringBuilder();
		// NOTE(review): decodes as UTF-8 instead of the platform default; the
		// response's declared charset is not consulted - confirm for other sites.
		try (BufferedReader in = new BufferedReader(
				new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
			String line;
			while ((line = in.readLine()) != null) {
				html.append(line).append('\n');
			}
		}
		return html.toString();
	}

	/**
	 * Copies the resource at {@code imageUrl} into the file {@code targetPath}.
	 * Both streams are closed before returning, whether or not the copy succeeds.
	 *
	 * @param imageUrl   location of the image
	 * @param targetPath file to create or overwrite
	 * @throws IOException if the image cannot be read or the file cannot be written
	 */
	private static void downloadImage(URL imageUrl, String targetPath) throws IOException {
		URLConnection connection = imageUrl.openConnection();
		connection.connect();

		byte[] buffer = new byte[BUFFER_SIZE];
		// The input stream is opened first so that a failed download does not
		// leave an empty output file behind.
		try (InputStream in = new BufferedInputStream(connection.getInputStream());
				OutputStream out = new FileOutputStream(targetPath)) {
			int read;
			while ((read = in.read(buffer)) != -1) {
				out.write(buffer, 0, read);
			}
		}
	}

}



你可能感兴趣的:(java,爬虫,url,nio)