获取页面音频资料和字幕文件,格式分别为MP3和lrc.
页面地址为:
https://www.51voa.com/VOA_Special_English/researchers-call-for-a-use-tax-to-clean-up-space-84650.html
页面的部分源码如下:
分析结果:
href="xxx.mp3"和href="xxx.lrc"这两个属性值是我们需要的数据,分别是对应格式文件的下载链接。
下面通过页面URL获取该页面的源码,并从中提取标题以及mp3、lrc的下载链接。
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the HTML source of a web page over HTTP and extracts simple pieces
 * of text (such as the page title) from it.
 *
 * @author Administrator
 */
public class HtmlRequest {

    /** Maximum time to wait for the TCP connection to be established. */
    private static final int CONNECT_TIMEOUT_MS = 5 * 1000;

    /** Maximum time to wait for data once connected, so a stalled server cannot block forever. */
    private static final int READ_TIMEOUT_MS = 30 * 1000;

    /** Size of the copy buffer used when draining a stream. */
    private static final int BUFFER_SIZE = 1024;

    /** Matches a complete title element, including its opening and closing tags. */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches a single HTML tag. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /** Extracts the charset parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the document behind {@code url} and returns it as text.
     *
     * <p>The body is decoded with the charset announced in the response's
     * Content-Type header, falling back to UTF-8 when none is given or the
     * name is not supported.
     *
     * @param url HTTP(S) address of the page to fetch
     * @return the response body decoded as a string
     * @throws IOException if the connection fails or the body cannot be read
     */
    public String getURLSource(URL url) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);
            byte[] data = readInputStream(conn.getInputStream());
            return new String(data, charsetOf(conn.getContentType()));
        } finally {
            // Release the connection even when reading fails.
            conn.disconnect();
        }
    }

    /**
     * Reads a stream to its end and returns everything that was read.
     * The stream is always closed, whether or not reading succeeds.
     *
     * @param instream stream to drain
     * @return all bytes read from the stream
     * @throws IOException if reading from the stream fails
     */
    public byte[] readInputStream(InputStream instream) throws IOException {
        try (InputStream in = instream) {
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] buffer = new byte[BUFFER_SIZE];
            int len;
            while ((len = in.read(buffer)) != -1) {
                collected.write(buffer, 0, len);
            }
            return collected.toByteArray();
        }
    }

    /**
     * Extracts the page title from HTML source.
     *
     * <p>Every title element found is stripped of its tags and the results are
     * concatenated; surrounding whitespace is removed from the final value.
     *
     * @param htmlSource HTML source text
     * @return the title text, or an empty string when no title element exists
     */
    public String getTitle(String htmlSource) {
        StringBuilder title = new StringBuilder();
        Matcher matcher = TITLE_PATTERN.matcher(htmlSource);
        while (matcher.find()) {
            title.append(matcher.group());
        }
        return outTag(title.toString()).trim();
    }

    /**
     * Removes every HTML tag from the given text.
     *
     * @param s text that may contain tags
     * @return the text with all tags removed
     */
    public String outTag(String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /** Picks the charset named in a Content-Type header value, defaulting to UTF-8. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher matcher = CHARSET_PATTERN.matcher(contentType);
            if (matcher.find()) {
                try {
                    return Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: fall through to the default.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }
}
import java.io.File;
import java.net.URL;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Main {
public static void main(String[] args) throws Exception {
HtmlRequest httpRequest=new HtmlRequest();
Scanner sc = new Scanner(System.in);
String[] strs=sc.next().split(",");
for(int i=0;i
实现多线程下载
其中DownloadManager构造方法的参数依次是:
1.保存路径,2.线程数,3.下载链接
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
/**
 * Coordinates a multi-threaded download of a single HTTP resource.
 *
 * <p>The file is split into byte ranges, one {@link DownloadThread} per
 * non-empty range. Each worker writes its range to a part file named
 * {@code savePath + partNumber}; once every worker reports completion the part
 * files are concatenated into {@code savePath} and deleted.
 *
 * <p>Limitation: if the server does not report a positive Content-Length no
 * worker is started and an empty output file is produced.
 */
public class DownloadManager implements Runnable {

    /** Pause between completion checks, to avoid burning CPU while waiting. */
    private static final long POLL_INTERVAL_MS = 1000L;

    /** Connect and read timeout used when asking the server for the file size. */
    private static final int TIMEOUT_MS = 10 * 1000;

    /** Size of the copy buffer used when merging part files. */
    private static final int MERGE_BUFFER_SIZE = 2048;

    // Destination path of the merged file; part files are this path plus a number.
    private String savePath;
    // Total number of download segments/threads requested.
    private int threadNum;
    // Address of the file to download.
    private String urlFile;
    // Whether the workers have already been started.
    private boolean isStarted;
    // Worker tasks, inspected to decide when the part files can be merged.
    private List<DownloadThread> downloadList = new ArrayList<DownloadThread>();
    // Threads started by startDownload(), used to detect workers that died without finishing.
    private final List<Thread> workers = new ArrayList<Thread>();

    /**
     * @param savePath  path of the final file
     * @param threadNum number of segments to download in parallel
     * @param urlFile   address of the file to download
     */
    public DownloadManager(String savePath, int threadNum, String urlFile) {
        super();
        this.savePath = savePath;
        this.threadNum = threadNum;
        this.urlFile = urlFile;
    }

    /** Starts the whole download on a new coordinator thread, which in turn starts the workers. */
    public void action() {
        new Thread(this).start();
    }

    /**
     * Starts the workers (unless already started), waits for them, merges the
     * part files when all of them finished, and prints the elapsed time.
     */
    @Override
    public void run() {
        long t1 = System.currentTimeMillis();
        System.out.println(t1);
        // Start the workers only once.
        if (!isStarted) {
            startDownload();
            isStarted = true;
        }
        if (waitForWorkers()) {
            mergeFiles();
        } else {
            // Merging incomplete parts would silently produce a corrupt file.
            System.err.println("下载失败:部分线程未完成,不合并文件");
        }
        long t2 = System.currentTimeMillis();
        System.out.println(t2);
        System.out.println("下载用时:" + (t2 -t1));
    }

    /**
     * Creates and starts one worker per non-empty segment.
     * Empty segments (file smaller than the thread count) get no worker.
     */
    public void startDownload() {
        int[][] posAndLength = getPosAndLength();
        for (int i = 0; i < posAndLength.length; i++) {
            int pos = posAndLength[i][0];
            int length = posAndLength[i][1];
            int partNumber = i + 1;
            if (length <= 0) {
                // Nothing to fetch for this segment. Remove any stale part file left by
                // an earlier aborted run so that it is not merged by mistake.
                new File(savePath + partNumber).delete();
                continue;
            }
            DownloadThread task = new DownloadThread(partNumber, length, pos, savePath, urlFile);
            downloadList.add(task);
            Thread worker = new Thread(task);
            workers.add(worker);
            worker.start();
        }
        if (downloadList.isEmpty()) {
            System.err.println("文件大小为0或未知,未启动下载线程");
        }
    }

    /**
     * Asks the server for the size of the file.
     *
     * @return the size in bytes, or 0 when the request fails or the server
     *         does not report a positive Content-Length
     */
    public long getFileLength() {
        System.out.println("获得文件大小 start......");
        HttpURLConnection conn = null;
        long result = 0;
        try {
            URL url = new URL(urlFile);
            conn = (HttpURLConnection) url.openConnection();
            conn.setConnectTimeout(TIMEOUT_MS);
            conn.setReadTimeout(TIMEOUT_MS);
            // Returns -1 when the header is missing or malformed instead of throwing.
            long reported = conn.getContentLengthLong();
            if (reported > 0) {
                result = reported;
            }
        } catch (IOException e) {
            // Also covers MalformedURLException, which is an IOException.
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
        System.out.println("获得文件大小 end......" + result);
        return result;
    }

    /**
     * Computes the start offset and byte count of every segment.
     *
     * @return one row per thread: {@code [i][0]} is the start offset and
     *         {@code [i][1]} the number of bytes; rows are contiguous and
     *         together cover the whole file exactly once
     * @throws IllegalStateException    if the file is too large for int offsets
     * @throws IllegalArgumentException if the thread count is not positive
     */
    public int[][] getPosAndLength() {
        long fileLength = getFileLength();
        if (fileLength > Integer.MAX_VALUE) {
            // Offsets are ints throughout; a silent cast would yield negative ranges.
            throw new IllegalStateException("File too large for int offsets: " + fileLength);
        }
        return splitRanges((int) fileLength, threadNum);
    }

    /** Concatenates the part files into the final file, in order, deleting each part afterwards. */
    public void mergeFiles() {
        System.out.println("合并文件 start......");
        byte[] buffer = new byte[MERGE_BUFFER_SIZE];
        try (OutputStream out = new FileOutputStream(savePath)) {
            for (int i = 1; i <= threadNum; i++) {
                File part = new File(savePath + i);
                if (!part.exists()) {
                    // Empty segments never get a part file.
                    continue;
                }
                try (InputStream in = new FileInputStream(part)) {
                    int read;
                    while ((read = in.read(buffer)) != -1) {
                        out.write(buffer, 0, read);
                    }
                }
                if (!part.delete()) {
                    System.err.println("无法删除临时文件:" + part);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("合并文件 end......");
    }

    /**
     * Blocks until every task reports completion.
     *
     * @return {@code true} when all tasks finished; {@code false} when the
     *         started workers have all terminated with at least one task
     *         unfinished, or when the waiting thread is interrupted
     */
    private boolean waitForWorkers() {
        while (true) {
            if (allFinished()) {
                return true;
            }
            if (!workers.isEmpty() && !anyWorkerAlive()) {
                // Every worker has terminated. Re-check once, because a worker may have
                // set its flag between the two checks; otherwise at least one failed
                // and waiting any longer would never end.
                return allFinished();
            }
            try {
                Thread.sleep(POLL_INTERVAL_MS);
            } catch (InterruptedException e) {
                // Preserve the interrupt for the caller and stop waiting.
                Thread.currentThread().interrupt();
                return false;
            }
        }
    }

    /** Returns whether every task in the download list reports completion. */
    private boolean allFinished() {
        for (DownloadThread task : downloadList) {
            if (!task.isFinish()) {
                return false;
            }
        }
        return true;
    }

    /** Returns whether at least one started worker thread is still running. */
    private boolean anyWorkerAlive() {
        for (Thread worker : workers) {
            if (worker.isAlive()) {
                return true;
            }
        }
        return false;
    }

    /** Splits {@code fileLength} bytes into {@code parts} contiguous ranges whose sizes differ by at most one. */
    private static int[][] splitRanges(int fileLength, int parts) {
        if (parts <= 0) {
            throw new IllegalArgumentException("threadNum must be positive: " + parts);
        }
        int[][] result = new int[parts][2];
        int base = fileLength / parts;
        int extra = fileLength % parts;
        int pos = 0;
        for (int i = 0; i < parts; i++) {
            // The first 'extra' segments take one additional byte each.
            int length = base + (i < extra ? 1 : 0);
            result[i][0] = pos;
            result[i][1] = length;
            pos += length;
        }
        return result;
    }

    /** @return path of the final file */
    public String getSavePath() {
        return savePath;
    }

    /** @param savePath path of the final file */
    public void setSavePath(String savePath) {
        this.savePath = savePath;
    }

    /** @return number of segments to download in parallel */
    public int getThreadNum() {
        return threadNum;
    }

    /** @param threadNum number of segments to download in parallel */
    public void setThreadNum(int threadNum) {
        this.threadNum = threadNum;
    }

    /** @return address of the file to download */
    public String getUrlFile() {
        return urlFile;
    }

    /** @param urlFile address of the file to download */
    public void setUrlFile(String urlFile) {
        this.urlFile = urlFile;
    }

    /** @return whether the workers have already been started */
    public boolean isStarted() {
        return isStarted;
    }

    /** @param isStarted whether the workers should be considered started */
    public void setStarted(boolean isStarted) {
        this.isStarted = isStarted;
    }

    /** @return the worker tasks tracked by this manager */
    public List<DownloadThread> getDownloadList() {
        return downloadList;
    }

    /** @param downloadList the worker tasks to track */
    public void setDownloadList(List<DownloadThread> downloadList) {
        this.downloadList = downloadList;
    }
}
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Downloads one byte range of a remote file into its own part file.
 *
 * <p>The part file is named {@code savePath + whichThread} (file1, file2, ...).
 * {@link #isFinish()} becomes {@code true} only after the range has been
 * written and the part file closed; it stays {@code false} when the download
 * fails with an exception.
 */
public class DownloadThread implements Runnable {

    /** Size of the copy buffer. */
    private static final int BUFFER_SIZE = 4096;

    /** Maximum time to wait for the connection to be established. */
    private static final int CONNECT_TIMEOUT_MS = 10 * 1000;

    /** Maximum time to wait for data, so a stalled server cannot block the worker forever. */
    private static final int READ_TIMEOUT_MS = 30 * 1000;

    /** Browser-like User-Agent, sent so that the server does not refuse the download. */
    private static final String USER_AGENT =
            "Firefox Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3";

    // Number of this worker; used as the suffix of the part file name.
    private int whichThread;
    // Completion flag; volatile because it is polled from another thread.
    private volatile boolean isFinish;
    // Number of bytes this worker has to download.
    private int length;
    // Offset of the first byte requested from the server.
    private int startPosition;
    // Path prefix of the part file.
    private String savePath;
    // Address of the file to download.
    private String url;

    /** Creates an unconfigured worker; configure it through the setters before running it. */
    public DownloadThread() {
        super();
    }

    /**
     * @param whichThread   number of this worker, used as the part file suffix
     * @param length        number of bytes to download
     * @param startPosition offset of the first byte to download
     * @param savePath      path prefix of the part file
     * @param url           address of the file to download
     */
    public DownloadThread(int whichThread, int length, int startPosition,
            String savePath, String url) {
        super();
        this.whichThread = whichThread;
        this.length = length;
        this.startPosition = startPosition;
        this.savePath = savePath;
        this.url = url;
    }

    /**
     * Requests the configured byte range and writes it to the part file.
     * Failures are printed and leave the completion flag unset.
     */
    @Override
    public void run() {
        HttpURLConnection conn = null;
        try {
            System.out.println("正在执行的线程:" + whichThread);
            if (length <= 0) {
                // Nothing to fetch: produce an empty part file without contacting the
                // server, which would reject a range starting at or past the end.
                new FileOutputStream(savePath + whichThread).close();
                isFinish = true;
                return;
            }
            URL fileUrl = new URL(url);
            conn = (HttpURLConnection) fileUrl.openConnection();
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);
            conn.setRequestProperty("User-Agent", USER_AGENT);
            // Ask only for the bytes this worker needs; long arithmetic avoids int overflow.
            long lastByte = (long) startPosition + length - 1;
            conn.setRequestProperty("Range", "bytes=" + startPosition + "-" + lastByte);
            conn.connect();
            try (InputStream in = conn.getInputStream();
                    OutputStream out = new FileOutputStream(savePath + whichThread)) {
                int copied = copyRange(in, out, length);
                if (copied < length) {
                    System.err.println("线程" + whichThread + "实际下载" + copied
                            + "字节,少于预期的" + length + "字节");
                }
            }
            // Set only after the part file has been closed, so a reader sees complete data.
            isFinish = true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Copies up to {@code length} bytes from {@code in} to {@code out}, writing
     * exactly the bytes each read returned. Package-private so that it can be
     * tested without a network connection.
     *
     * @param in     source stream
     * @param out    destination stream
     * @param length maximum number of bytes to copy
     * @return the number of bytes copied; less than {@code length} only when
     *         the source ended early
     * @throws IOException if reading or writing fails
     */
    static int copyRange(InputStream in, OutputStream out, int length) throws IOException {
        byte[] buffer = new byte[BUFFER_SIZE];
        int count = 0;
        while (count < length) {
            // Never request more than is still needed, so nothing has to be discarded.
            int wanted = Math.min(buffer.length, length - count);
            int read = in.read(buffer, 0, wanted);
            if (read == -1) {
                break;
            }
            out.write(buffer, 0, read);
            count += read;
        }
        return count;
    }

    /** @return offset of the first byte to download */
    public int getStartPosition() {
        return startPosition;
    }

    /** @param startPosition offset of the first byte to download */
    public void setStartPosition(int startPosition) {
        this.startPosition = startPosition;
    }

    /** @return address of the file to download */
    public String getUrl() {
        return url;
    }

    /** @param url address of the file to download */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return number of this worker */
    public int getWhichThread() {
        return whichThread;
    }

    /** @param whichThread number of this worker, used as the part file suffix */
    public void setWhichThread(int whichThread) {
        this.whichThread = whichThread;
    }

    /** @return number of bytes to download */
    public int getLength() {
        return length;
    }

    /** @param length number of bytes to download */
    public void setLength(int length) {
        this.length = length;
    }

    /** @return path prefix of the part file */
    public String getSavePath() {
        return savePath;
    }

    /** @param savePath path prefix of the part file */
    public void setSavePath(String savePath) {
        this.savePath = savePath;
    }

    /** @return whether this worker has completed its download */
    public boolean isFinish() {
        return isFinish;
    }

    /** @param isFinish new value of the completion flag */
    public void setFinish(boolean isFinish) {
        this.isFinish = isFinish;
    }
}
按回车后就可以在main方法里指定的目录看到下载的文件了。
需要下载其他页面的其他文件只需要修改页面链接、正则部分,以及文件格式后缀,正则部分可以根据需要增删,也就是下面圈出来的那部分:
本文有部分代码来自:
https://ljlleo.iteye.com/blog/1397765