Java爬虫-爬取页面源码并下载页面的指定格式文件

 

一、客户需求

获取页面音频资料和字幕文件,格式分别为MP3和lrc.

页面地址为:

https://www.51voa.com/VOA_Special_English/researchers-call-for-a-use-tax-to-clean-up-space-84650.html

 

二、分析页面源码

 

页面的部分源码如下:

Java爬虫-爬取页面源码并下载页面的指定格式文件_第1张图片

Java爬虫-爬取页面源码并下载页面的指定格式文件_第2张图片

 

 

分析结果

       href="xxx.mp3" 和 href="xxx.lrc" 这两个属性是我们可以用到的数据,它们是对应格式文件的下载链接,其中 xxx 是下载文件的标题。

 

 

三、实现过程

java代码部分,目录结构如下图:

Java爬虫-爬取页面源码并下载页面的指定格式文件_第3张图片

 

HtmlRequest.java

该类用来通过页面URL获取该页面的源码,并从中得到标题和mp3、lrc的下载链接。

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;



/**

 * 通过网站域名URL获取该网站的源码

 *

 * @author Administrator

 *

 */

/**
 * Fetches the HTML source of a page over HTTP and extracts simple pieces of
 * information (currently the page title) from it.
 */
public class HtmlRequest {

    /**
     * Matches a complete title element, tags included. DOTALL lets the title
     * text span several lines; matching is case-insensitive because HTML tag
     * names are.
     */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Extracts the charset parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the resource behind the given URL and returns it as text.
     *
     * <p>The body is decoded with the charset announced in the response's
     * Content-Type header, falling back to UTF-8, instead of the platform
     * default charset (which makes the result machine-dependent).
     *
     * @param url address of the page to fetch
     * @return the page source as a string
     * @throws Exception if the connection or the read fails
     */
    public String getURLSource(URL url) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(5 * 1000);
            byte[] data = readInputStream(conn.getInputStream());
            return new String(data, charsetOf(conn.getContentType()));
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Reads the given stream to its end and returns everything that was read.
     * The stream is always closed, even when reading fails.
     *
     * @param instream stream to drain
     * @return all bytes of the stream (empty array for an empty stream)
     * @throws Exception if reading from the stream fails
     */
    public byte[] readInputStream(InputStream instream) throws Exception {
        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        try {
            int len;
            while ((len = instream.read(buffer)) != -1) {
                outStream.write(buffer, 0, len);
            }
        } finally {
            instream.close();
        }
        return outStream.toByteArray();
    }

    /**
     * Extracts the title text from an HTML source string.
     *
     * <p>All title elements found are concatenated in document order and the
     * surrounding tags are stripped.
     *
     * @param htmlSource HTML source of a page
     * @return the title text, or an empty string if the page has no title element
     */
    public String getTitle(String htmlSource) {
        StringBuilder title = new StringBuilder();
        Matcher ma = TITLE_PATTERN.matcher(htmlSource);
        while (ma.find()) {
            title.append(ma.group());
        }
        return outTag(title.toString());
    }

    /**
     * Removes every HTML tag from the given string.
     *
     * @param s text that may contain tags
     * @return the text with everything between '<' and the next '>' removed
     */
    public String outTag(String s) {
        return s.replaceAll("<.*?>", "");
    }

    /** Picks the charset named in a Content-Type value, defaulting to UTF-8. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PATTERN.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Unknown or malformed charset name: fall through to UTF-8.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

}

 

 

Main.java

 


 

import java.io.File;

import java.net.URL;

import java.util.Scanner;

import java.util.regex.Matcher;

import java.util.regex.Pattern;


public class Main {

    public static void main(String[] args) throws Exception {
        HtmlRequest httpRequest=new HtmlRequest();
        Scanner sc = new Scanner(System.in);
        String[] strs=sc.next().split(",");
        for(int i=0;i

DownloadManager.java

实现多线程下载

其中DownloadManager的实现参数分别是

1.保存路径,2.线程数,3.下载链接


 

import java.io.File; 

import java.io.FileInputStream; 

import java.io.FileOutputStream; 

import java.io.IOException; 

import java.io.InputStream; 

import java.io.OutputStream; 

import java.net.HttpURLConnection; 

import java.net.MalformedURLException; 

import java.net.URL; 

import java.util.ArrayList; 

import java.util.List; 

 

public class DownloadManager implements Runnable { 

    // 保存路径 

    private String savePath; 

    // 总的下载线程数 

    private int threadNum; 

    // 下载的链接地址 

    private String urlFile; 

    // 是否下载开始 

    private boolean isStarted; 

    // 用于监视何时合并文件存放Thread的list 

    private List downloadList = new ArrayList(); 

 

    public DownloadManager(String savePath, int threadNum, String urlFile) { 

        super(); 

        this.savePath = savePath; 

        this.threadNum = threadNum; 

        this.urlFile = urlFile; 

    } 

 

    // 最终调用线程下载。本线程中调用分线程。 

    public void action() { 

        new Thread(this).start(); 

    } 

 

    public void run() { 

        long t1 = System.currentTimeMillis(); 

        System.out.println(t1); 

        // 如果没有下载 , 就开始 , 并且将已经下载的变量值设为true 

        if (!isStarted) { 

            startDownload(); 

            isStarted = true; 

        } 

        while (true) { 

            // 初始化认为所有线程下载完成,逐个检查 

            boolean finish = true; 

            // 如果有任何一个没完成,说明下载没完成,不能合并文件 

            for (DownloadThread thread : downloadList) { 

                if (!thread.isFinish()) { 

                    finish = false; 

                    break; 

                } 

            } 

            // 全部下载完成才为真 

            if (finish) { 

                // 合并文件 

                mergeFiles(); 

                // 跳出循环 , 下载结束 

                break; 

            } 

            // 休息一会 , 减少cpu消耗 

            try { 

                Thread.sleep(1000); 

            } catch (InterruptedException e) { 

                e.printStackTrace(); 

            } 

        } 

        long t2 = System.currentTimeMillis(); 

        System.out.println(t2); 

         System.out.println("下载用时:" + (t2 -t1)); 

    } 

 

    public void startDownload() { 

        // 得到每个线程开始值 , 下载字节数大小 

        int[][] posAndLength = getPosAndLength(); 

        // 根据下载信息创建每个下载线程,并且启动他们。 

        for (int i = 0; i < posAndLength.length; i++) { 

            int pos = posAndLength[i][0]; 

            int length = posAndLength[i][1]; 

            DownloadThread downloadThread = new DownloadThread(i + 1, length, 

                    pos, savePath, urlFile); 

            new Thread(downloadThread).start(); 

            downloadList.add(downloadThread); 

        } 

    } 

 

    /**

     * 获得文件大小

     * 

     * @return 文件大小

     */ 

    public long getFileLength() { 

        System.out.println("获得文件大小  start......"); 

        HttpURLConnection conn = null; 

        long result = 0; 

        try { 

            URL url = new URL(urlFile); 

            conn = (HttpURLConnection) url.openConnection(); 

            // 使用Content-Length头信息获得文件大小 

            result = Long.parseLong(conn.getHeaderField("Content-Length")); 

        } catch (MalformedURLException e) { 

            e.printStackTrace(); 

        } catch (IOException e) { 

            e.printStackTrace(); 

        } finally { 

            if (conn != null) { 

                conn.disconnect(); 

            } 

        } 

        System.out.println("获得文件大小  end......" + result); 

        return result; 

    } 

 

    // 具体细节求出每个线程的开始位置和文件下载大小 

    public int[][] getPosAndLength() { 

        int[][] result = new int[threadNum][2]; 

        int fileLength = (int) getFileLength(); 

        int every = fileLength % threadNum == 0 ? fileLength / threadNum 

                : fileLength / threadNum + 1; 

        for (int i = 0; i < result.length; i++) { 

            int length = 0; 

            if (i != result.length - 1) { 

                length = every; 

            } else { 

                length = fileLength - i * every; 

            } 

            result[i][0] = i * every; 

            result[i][1] = length; 

        } 

        return result; 

    } 

 

    // 合并文件 

    public void mergeFiles() { 

        System.out.println("合并文件  start......"); 

        OutputStream out = null; 

        try { 

            out = new FileOutputStream(savePath); 

            for (int i = 1; i <= threadNum; i++) { 

                InputStream in = new FileInputStream(savePath + i); 

                byte[] bytes = new byte[2048]; 

                int read = 0; 

                while ((read = in.read(bytes)) != -1) { 

                    out.write(bytes, 0, read); 

                    out.flush(); 

                } 

                if (in != null) {  

                    in.close(); 

                    new File(savePath + i).delete(); 

                } 

            } 

        } catch (Exception e) { 

            e.printStackTrace(); 

        } finally { 

            if (out != null) { 

                try { 

                    out.close(); 

                } catch (IOException e) { 

                    e.printStackTrace(); 

                } 

            } 

        } 

        System.out.println("合并文件  end......"); 

    } 

 

    public String getSavePath() { 

        return savePath; 

    } 

 

    public void setSavePath(String savePath) { 

        this.savePath = savePath; 

    } 

 

    public int getThreadNum() { 

        return threadNum; 

    } 

 

    public void setThreadNum(int threadNum) { 

        this.threadNum = threadNum; 

    } 

 

    public String getUrlFile() { 

        return urlFile; 

    } 

 

    public void setUrlFile(String urlFile) { 

        this.urlFile = urlFile; 

    }  

 

    public boolean isStarted() { 

        return isStarted; 

    } 

 

    public void setStarted(boolean isStarted) { 

        this.isStarted = isStarted; 

    } 

 

    public List getDownloadList() { 

        return downloadList; 

    } 

 

    public void setDownloadList(List downloadList) { 

        this.downloadList = downloadList; 

    } 

} 

DownloadThread.java

import java.io.FileOutputStream; 

import java.io.IOException; 

import java.io.InputStream; 

import java.io.OutputStream; 

import java.net.HttpURLConnection; 

import java.net.URL; 



/**
 * Downloads one contiguous byte range of a remote file into its own part
 * file, named {@code savePath + whichThread}.
 *
 * <p>{@link #isFinish()} becomes true only after the part file has been
 * written and closed; it stays false if the download fails.
 */
public class DownloadThread implements Runnable {

    // 1-based index of this worker; also the suffix of its part file.
    private int whichThread;

    // Set once the part is complete. Volatile because it is written on the
    // worker thread and polled from another thread.
    private volatile boolean isFinish;

    // Number of bytes this worker has to download.
    private int length;

    // Offset in the remote file at which this worker's range starts.
    private int startPosition;

    // Base path of the download; the part file is savePath + whichThread.
    private String savePath;

    // Address of the file to download.
    private String url;

    public DownloadThread() {
        super();
    }

    /**
     * @param whichThread   1-based index of this worker
     * @param length        number of bytes to download
     * @param startPosition offset of the first byte to download
     * @param savePath      base path; the part is written to savePath + whichThread
     * @param url           address of the file to download
     */
    public DownloadThread(int whichThread, int length, int startPosition,
            String savePath, String url) {
        super();
        this.whichThread = whichThread;
        this.length = length;
        this.startPosition = startPosition;
        this.savePath = savePath;
        this.url = url;
    }

    /**
     * Fetches bytes {@code startPosition .. startPosition + length - 1} with
     * an HTTP range request and writes them to the part file. Errors are
     * printed and leave {@link #isFinish()} false.
     */
    public void run() {
        System.out.println("正在执行的线程:" + whichThread);
        if (length <= 0) {
            // Nothing to fetch. A range request starting at or past the end of
            // the file would be rejected by the server, so just create the
            // (empty) part file and report completion.
            try {
                new FileOutputStream(savePath + whichThread).close();
                isFinish = true;
            } catch (IOException e) {
                e.printStackTrace();
            }
            return;
        }
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setRequestMethod("GET");
            // Present a browser User-Agent; some servers refuse other clients.
            conn.setRequestProperty(
                            "User-Agent",
                            "Firefox Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3");
            // Ask only for this worker's slice; the end offset is inclusive.
            long lastByte = (long) startPosition + length - 1;
            conn.setRequestProperty("Range", "bytes=" + startPosition + "-" + lastByte);
            conn.connect();
            try (InputStream in = conn.getInputStream();
                    OutputStream out = new FileOutputStream(savePath + whichThread)) {
                byte[] bytes = new byte[4096];
                int count = 0;
                while (count < length) {
                    // Never read past this worker's slice, and write exactly
                    // the number of bytes read() returned: a network read may
                    // deliver fewer bytes than requested.
                    int read = in.read(bytes, 0, Math.min(bytes.length, length - count));
                    if (read == -1) {
                        break;
                    }
                    out.write(bytes, 0, read);
                    count += read;
                }
            }
            // Flag completion only after the part file is closed, so whoever
            // polls isFinish() never sees an open part file.
            isFinish = true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    public int getStartPosition() {
        return startPosition;
    }

    public void setStartPosition(int startPosition) {
        this.startPosition = startPosition;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public int getWhichThread() {
        return whichThread;
    }

    public void setWhichThread(int whichThread) {
        this.whichThread = whichThread;
    }

    public int getLength() {
        return length;
    }

    public void setLength(int length) {
        this.length = length;
    }

    public String getSavePath() {
        return savePath;
    }

    public void setSavePath(String savePath) {
        this.savePath = savePath;
    }

    public boolean isFinish() {
        return isFinish;
    }

    public void setFinish(boolean isFinish) {
        this.isFinish = isFinish;
    }

}

 

按回车后就可以在main方法里指定的目录看到下载的文件了。

需要下载其他页面的其他文件只需要修改页面链接、正则部分,以及文件格式后缀,正则部分可以根据需要增删,也就是下面圈出来的那部分:

 

本文有部分代码来自:

 

https://ljlleo.iteye.com/blog/1397765

 

你可能感兴趣的:(线上问题分析)