一,步骤一:获取网页源码
1,定义要爬取的页面的URL对象
//定义即将访问的链接
String url="http://www.csdn.net";
//获取CSDN的URL对象
URL realURL = new URL(url);
URLConnection connection = realURL.openConnection();
connection.connect();
in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
String line="";
while((line=in.readLine())!=null){
    result+=line+"\n";
}
二,步骤二:正则匹配,存储图片
1,找出正则表达式的样板
Pattern pattern = Pattern.compile("<img \\S+\\ssrc=\"(.+?)\"");
Matcher matcher = pattern.matcher(result);
imgFile= new FileOutputStream("D:\\CSDN"+i+".png");
URL imgURL = new URL(img); URLConnection imgConnection = imgURL.openConnection(); imgConnection.connect(); bufferedImage = (BufferedInputStream) new BufferedInputStream(imgConnection.getInputStream());
5,将缓冲流写入文件中
while((size=bufferedImage.read(buf))!=-1){
    imgFile.write(buf, 0, size);
}
6,将输入输出流关闭
in.close(); imgFile.close(); bufferedImage.close();
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal image crawler: downloads the source of one web page, finds
 * {@code <img ... src="...">} tags with a regular expression, and saves every
 * referenced image to {@code D:\CSDN<n>.png}, where {@code n} counts up from 0.
 */
public class CrawOne {

    /** Page whose source is scanned for image tags. */
    private static final String PAGE_URL = "http://www.csdn.net";

    /**
     * Matches an {@code <img} tag that has one whitespace-free token before its
     * {@code src} attribute; group 1 captures the attribute value.
     */
    private static final Pattern IMG_SRC = Pattern.compile("<img \\S+\\ssrc=\"(.+?)\"");

    /** Maximum number of bytes copied per read when saving an image. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Fetches the page, extracts the image links and stores each image on disk.
     *
     * @param args unused
     * @throws IOException if the page or an image cannot be read, or a file
     *                     cannot be written
     */
    public static void main(String[] args) throws IOException {
        try {
            URL pageUrl = new URL(PAGE_URL);
            String html = readPage(pageUrl);

            Matcher matcher = IMG_SRC.matcher(html);
            int index = 0;
            while (matcher.find()) {
                System.out.println("find one");
                // group(0) is the whole match; group(1) is the captured src value.
                String link = matcher.group(1);

                URL imageUrl;
                try {
                    // Resolve against the page URL so relative and
                    // protocol-relative links ("/a.png", "//host/a.png") work too.
                    imageUrl = new URL(pageUrl, link);
                } catch (MalformedURLException e) {
                    // One unusable link must not abort the remaining downloads.
                    System.err.println("Skipping malformed image link: " + link + " (" + e + ")");
                    continue;
                }

                download(imageUrl, "D:\\CSDN" + index + ".png");
                index++;
            }
        } catch (MalformedURLException e) {
            System.err.println("Malformed URL: " + e);
        }
        System.out.println("Get some picture.");
    }

    /**
     * Reads the whole response body of {@code pageUrl} as text, one
     * {@code '\n'} appended after every line.
     */
    private static String readPage(URL pageUrl) throws IOException {
        URLConnection connection = pageUrl.openConnection();
        connection.connect();

        StringBuilder source = new StringBuilder();
        // Decode as UTF-8 explicitly instead of relying on the platform default charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                source.append(line).append('\n');
            }
        }
        return source.toString();
    }

    /**
     * Copies the bytes served at {@code imageUrl} into the file {@code targetPath}.
     * Both streams are closed before returning, whether or not the copy succeeds.
     */
    private static void download(URL imageUrl, String targetPath) throws IOException {
        URLConnection connection = imageUrl.openConnection();
        connection.connect();

        // The network stream is opened first so that a failed request does not
        // leave an empty output file behind.
        try (InputStream in = new BufferedInputStream(connection.getInputStream());
             OutputStream out = new FileOutputStream(targetPath)) {
            byte[] buf = new byte[BUFFER_SIZE];
            int size;
            while ((size = in.read(buf)) != -1) {
                out.write(buf, 0, size);
            }
        }
    }
}