通过 Java 的 URL(HttpURLConnection)实现,可以爬取简单视频
爬虫可以爬取网页和图片,使用 jsoup 作为底层实现
可以进行二次开发
环境:javase,eclipse
jar:jsoup
maven 依赖(groupId:artifactId:version):
org.jsoup:jsoup:1.11.3
爬虫对象
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import study.core.Reflex;
/**
 * Crawler that downloads pages, text and binary files.
 * <p>
 * Two transports are offered: the {@code download(...)} family goes through jsoup,
 * while the {@code kernel*(...)} family talks to {@link HttpURLConnection} directly.
 * The configuration methods ({@link #userAgent}, {@link #timeOut}, ...) return the
 * crawler itself so calls can be chained.
 *
 * @author Administrator
 * @since 0.0.1
 */
public class Reptile {

    /** Charset name used whenever the caller passes a null or empty charset. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /**
     * Request header holder. {@link Reptile#kernel(URL)} reads it through
     * {@code properties()} (inherited from {@code Reflex}) and copies every entry
     * onto the outgoing request.
     *
     * @since 0.0.2
     */
    class Header implements Reflex {
        /**
         * Client identification string (sent as the {@code User-Agent} header).
         *
         * @since 0.0.2
         */
        private String userAgent;

        public String getUserAgent() {
            return userAgent;
        }

        public void setUserAgent(String userAgent) {
            this.userAgent = userAgent;
        }
    }

    /** The current context object; always set to {@code this} by the constructor. */
    protected Reptile context = null;

    /** Client identification string used by both transports. */
    private String userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36";

    /** Connect timeout in milliseconds (also passed to jsoup as its timeout). */
    private Integer timeOut = 5000;

    /**
     * Read timeout in milliseconds, used by the {@code kernel*} transport.
     *
     * @since 0.0.2
     */
    private Integer readTimeOut = 5000;

    /**
     * HTTP request method, {@code get} by default. It is upper-cased before use.
     *
     * @since 0.0.2
     */
    private String requestType = "get";

    /**
     * Request headers sent by the {@code kernel*} transport.
     *
     * @since 0.0.2
     */
    private Header header = new Header();

    public Header getHeader() {
        return header;
    }

    public void setHeader(Header header) {
        this.header = header;
    }

    /** Status code of the last response obtained through {@code download(...)}. */
    private Integer code = 0;

    private Reptile() {
        context = this;
    }

    /**
     * Creates a new crawler.
     *
     * @return a fresh, independently configured instance
     */
    public static Reptile instance() {
        return new Reptile();
    }

    /**
     * Returns the status code recorded by the last {@code download(...)} call.
     *
     * @return the status code, or 0 if nothing has been downloaded yet
     */
    public Integer code() {
        return code;
    }

    /**
     * Sets the client identification string.
     *
     * @param userAgent value to send as the user agent
     * @return this crawler, for chaining
     */
    public Reptile userAgent(String userAgent) {
        this.userAgent = userAgent;
        return context;
    }

    /**
     * Sets the connect timeout.
     *
     * @param timeOut timeout in milliseconds
     * @return this crawler, for chaining
     */
    public Reptile timeOut(Integer timeOut) {
        this.timeOut = timeOut;
        return context;
    }

    /**
     * Sets the read timeout.
     *
     * @param readTimeOut timeout in milliseconds
     * @return this crawler, for chaining
     * @since 0.0.2
     */
    public Reptile readTimeOut(Integer readTimeOut) {
        this.readTimeOut = readTimeOut;
        return context;
    }

    /**
     * Sets the HTTP request method.
     *
     * @param type method name, e.g. {@code get} or {@code post}
     * @return this crawler, for chaining
     * @since 0.0.2
     */
    public Reptile requestType(String type) {
        this.requestType = type;
        return context;
    }

    /**
     * Downloads {@code url} through jsoup and hands the body back in the form
     * selected by {@code type}.
     *
     * @param url     address to fetch; must contain {@code http}
     * @param path    target file, ignored when {@code type} is null
     * @param type    {@code null} returns the raw body stream (the caller must close it),
     *                {@code "text"} saves the decoded text, anything else saves the raw bytes
     * @param charset charset for text mode; utf-8 when null or empty
     * @return an {@link InputStream} or a {@link File} depending on {@code type},
     *         or null when the status is not 200 or there is no body
     * @throws Exception if the url is invalid or the transfer fails
     */
    public Object download(final String url, final String path, final String type, String charset) throws Exception {
        // A null url used to slip past the length check and fail with a bare NPE.
        if (url == null || url.length() < 4) {
            throw new Exception("url无效:" + url);
        }
        if (url.indexOf("http") == -1) {
            throw new Exception("url不是http协议:" + url);
        }
        Connection connect = Jsoup.connect(url);
        connect.userAgent(userAgent);
        Response response = connect.timeout(timeOut).ignoreContentType(true).execute();
        BufferedInputStream inputStream = response.bodyStream();
        code = response.statusCode();
        if (code != 200 || inputStream == null) {
            if (inputStream != null) {
                inputStream.close();
            }
            return null;
        }
        if (type == null) {
            return inputStream;
        }
        if (type.equals("text")) {
            return Tool.save(path, getContentByInputStream(inputStream, charset), charset);
        }
        return Tool.save(path, inputStream);
    }

    /**
     * Opens {@code url} and returns its body as a stream.
     *
     * @param url address to fetch
     * @return the body stream (the caller must close it), or null when the status is not 200
     * @throws Exception if the url is invalid or the transfer fails
     */
    public InputStream download(final String url) throws Exception {
        // The type must be null to select stream mode; an empty string selected
        // "save to file" with an empty path, which always failed.
        return (InputStream) context.download(url, null, null, null);
    }

    /**
     * Downloads {@code url} as text and saves it to {@code path}.
     *
     * @param url     address to fetch
     * @param path    target file
     * @param charset charset used to decode and re-encode the text; utf-8 when null or empty
     * @return the saved file, or null when the status is not 200
     * @throws Exception if the url is invalid or the transfer fails
     */
    public File download(final String url, final String path, String charset) throws Exception {
        return (File) context.download(url, path, "text", charset);
    }

    /**
     * Downloads {@code url} as raw bytes and saves it to {@code path}.
     *
     * @param url  address to fetch
     * @param path target file
     * @return the saved file, or null when the status is not 200
     * @throws Exception if the url is invalid or the transfer fails
     */
    public File download(final String url, final String path) throws Exception {
        return (File) context.download(url, path, "type", "");
    }

    /**
     * Reads the whole stream as text and closes it.
     *
     * @param inputStream stream to drain; closed before returning, also on failure
     * @param charSet     charset name; utf-8 when null or empty
     * @return the decoded content
     * @throws IOException if reading fails or the charset is not supported
     */
    public synchronized String getContentByInputStream(InputStream inputStream, String charSet) throws IOException {
        String charsetName = Tool.isNull(charSet) ? DEFAULT_CHARSET : charSet;
        StringBuilder content = new StringBuilder();
        // The raw stream is listed first so it is closed even if the reader cannot be created.
        try (InputStream in = inputStream;
             InputStreamReader reader = new InputStreamReader(in, charsetName)) {
            char[] buffer = new char[1024];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                content.append(buffer, 0, read);
            }
        }
        return content.toString();
    }

    /**
     * Core of the {@code kernel*} transport: opens a connection to {@code url} with the
     * configured timeouts, method and headers.
     *
     * @param url address to connect to
     * @return the connected connection when the response code is 200, otherwise null
     *         (the connection is disconnected in that case)
     * @throws Exception if {@code url} is null or the connection fails
     * @since 0.0.2
     */
    public HttpURLConnection kernel(URL url) throws Exception {
        if (url == null) {
            throw new Exception("url为null" + url);
        }
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setConnectTimeout(timeOut);
        connection.setReadTimeout(readTimeOut);
        // Locale.ROOT keeps the method name stable under locales with special casing rules.
        connection.setRequestMethod(requestType.toUpperCase(java.util.Locale.ROOT));
        this.header.setUserAgent(userAgent);
        Map<?, ?> properties = (Map<?, ?>) this.header.properties();
        for (Map.Entry<?, ?> entry : properties.entrySet()) {
            String name = String.valueOf(entry.getKey());
            // "this$0" is the synthetic reference to the enclosing instance, not a header.
            if (name.equals("this$0")) {
                continue;
            }
            Object value = entry.getValue();
            if (value == null) {
                continue;
            }
            if (name.equals("userAgent")) {
                // The HTTP header is spelled "User-Agent"; "UserAgent" is ignored by servers.
                connection.addRequestProperty("User-Agent", value.toString());
            } else {
                connection.addRequestProperty(name, value.toString());
            }
        }
        connection.connect();
        if (connection.getResponseCode() == HttpURLConnection.HTTP_OK) {
            return connection;
        }
        connection.disconnect();
        return null;
    }

    /**
     * Fetches {@code url} and returns its body as text.
     *
     * @param url     address to fetch
     * @param charSet charset name; utf-8 when null or empty
     * @return the body text, or null when the response code is not 200
     * @throws Exception if the connection or the read fails
     */
    public String kernelText(URL url, String charSet) throws Exception {
        HttpURLConnection connection = context.kernel(url);
        if (connection == null) {
            return null;
        }
        try {
            return context.getContentByInputStream(connection.getInputStream(), charSet);
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Fetches {@code url} as text and saves it to {@code path}.
     *
     * @param url     address to fetch
     * @param charSet charset used to decode and re-encode the text
     * @param path    target file
     * @return the saved file
     * @throws Exception if nothing could be fetched or the save fails
     */
    public File kernelFile(URL url, String charSet, String path) throws Exception {
        String content = context.kernelText(url, charSet);
        if (content == null || content.length() == 0) {
            throw new Exception("内容无效无法保存:" + content);
        }
        return Tool.save(path, content, charSet);
    }

    /**
     * Fetches {@code url} as raw bytes and saves it to {@code path}.
     *
     * @param url  address to fetch
     * @param path target file
     * @return the saved file, or null when the response code is not 200
     * @throws Exception if the connection or the save fails
     */
    public File kernelFile(URL url, String path) throws Exception {
        HttpURLConnection connection = context.kernel(url);
        if (connection == null) {
            return null;
        }
        try {
            return Tool.save(path, connection.getInputStream());
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Downloads a large file in 1 MB chunks, optionally printing progress.
     *
     * @param url      address to fetch
     * @param path     target file
     * @param progress whether to print progress after every chunk
     * @return the saved file, or null when the response code is not 200
     * @throws Exception if the connection or the write fails
     */
    public File kernelBigFile(URL url, String path, boolean progress) throws Exception {
        HttpURLConnection connection = context.kernel(url);
        if (connection == null) {
            return null;
        }
        try {
            // long counters: int overflowed for files larger than 2 GB.
            long length = connection.getContentLengthLong();
            File file = Tool.mkdirs(path);
            try (InputStream inputStream = connection.getInputStream();
                 FileOutputStream outputStream = new FileOutputStream(file)) {
                byte[] data = new byte[1024 * 1024];
                long sum = 0;
                int len;
                while ((len = inputStream.read(data)) != -1) {
                    outputStream.write(data, 0, len);
                    sum += len;
                    if (progress) {
                        if (length > 0) {
                            System.out.println("已下载:" + String.format("%.3f", (double) sum / length * 100) + "%");
                        } else {
                            // The server did not report a length, so a percentage cannot be computed.
                            System.out.println("已下载:" + sum + "字节");
                        }
                    }
                }
            }
            return file;
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Downloads a large file and prints progress.
     *
     * @param url  address to fetch
     * @param path target file
     * @return the saved file, or null when the response code is not 200
     * @throws Exception if the connection or the write fails
     */
    public File kernelBigFile(URL url, String path) throws Exception {
        return context.kernelBigFile(url, path, true);
    }

    /**
     * Downloads a large file and prints progress.
     *
     * @param url  address to fetch, as a string
     * @param path target file
     * @return the saved file, or null when the response code is not 200
     * @throws Exception if the url is malformed, or the connection or the write fails
     */
    public File kernelBigFile(String url, String path) throws Exception {
        return context.kernelBigFile(new URL(url), path);
    }

    /**
     * Downloads a large file with one thread per 10 MB segment. The threads are only
     * started here; the method does not wait for them to finish.
     *
     * @deprecated kept for compatibility
     * @param url  address to fetch
     * @param path target file
     * @return always null, because the download is still running when this method returns
     * @throws Exception if the connection fails or the server reports no content length
     */
    @Deprecated
    public File kernelThread(String url, String path) throws Exception {
        HttpURLConnection connection = kernel(new URL(url));
        if (connection == null) {
            throw new Exception("链接对象不存在");
        }
        long length = connection.getContentLengthLong();
        // This connection is only a probe for the length; the workers open their own.
        connection.disconnect();
        if (length <= 0) {
            throw new Exception("无法获取文件长度,不能分段下载:" + length);
        }
        long segment = 10L * 1024 * 1024;
        // At least one thread: files under 10 MB used to give 0 threads and a division by zero.
        int threadCount = (int) Math.max(1L, length / segment);
        File file = Tool.mkdirs(path);
        try (RandomAccessFile target = new RandomAccessFile(file, "rwd")) {
            target.setLength(length); // reserve the file's full size up front
        }
        long size = length / threadCount;
        for (int id = 0; id < threadCount; id++) {
            // long arithmetic: id*size overflowed int for large files.
            long startIndex = id * size;
            long endIndex = (id + 1) * size - 1;
            if (id == threadCount - 1) {
                endIndex = length - 1L; // the last segment takes the remainder
            }
            System.out.println("第" + id + "个线程的下载区间为" + startIndex + "--" + endIndex);
            new DownLoadThread(startIndex, endIndex, url, id).setPath(path).start();
        }
        return null;
    }

    /**
     * Fetches a page, extracts addresses from it with {@code html} and passes each
     * one to {@code downloader} together with a target path below {@code paths}.
     *
     * @param pageUrl    page address
     * @param charSet    page charset, utf-8 by default
     * @param html       parser that extracts the addresses from the page text
     * @param paths      parent directory for the saved files
     * @param downloader called once per address; may be null, in which case nothing is downloaded
     * @return a {@link File} for {@code paths}
     * @throws Exception if an argument is invalid, the page cannot be read or no address is found
     */
    public File kernelSingle(String pageUrl, String charSet, Parsing html, String paths, Downloader downloader) throws Exception {
        if (Tool.isNull(pageUrl)) {
            Tool.e("页面连接地址无效:" + pageUrl);
        }
        if (Tool.isNull(charSet)) {
            charSet = DEFAULT_CHARSET;
        }
        String text = kernelText(new URL(pageUrl), charSet);
        if (Tool.isNull(text)) {
            Tool.e("不能解析,字符串为null" + text);
        }
        if (html == null) {
            Tool.e("解析对象无效或者不存在:" + html);
        }
        Set urls = html.urls(text);
        if (urls == null || urls.size() == 0) {
            // The message must not dereference urls, which may be null here.
            Tool.e("没有网络地址集" + (urls == null ? null : urls.size()));
        }
        if (Tool.isNull(paths)) {
            Tool.e("保存路径不存在:" + paths);
        }
        for (Object item : urls) {
            String url = (String) item;
            if (Tool.isNull(url)) {
                Tool.e("读取地址不存在,不能解析");
            }
            // Only real thunder:// links are converted; an ordinary address that merely
            // contains the word "thunder" must be left alone.
            if (url.startsWith("thunder://")) {
                url = Base64.conver(url);
            }
            String name = "";
            if (url.contains("/")) {
                name = url.substring(url.lastIndexOf("/") + 1);
            }
            String path = paths.endsWith("/") ? paths + name : paths + "/" + name;
            if (downloader != null) {
                downloader.download(url, path);
            }
        }
        return new File(paths);
    }

    /**
     * File and validation helpers.
     *
     * @author Administrator
     */
    public static class Tool {

        /**
         * Copies a stream into a file and closes the stream.
         *
         * @param path  target file
         * @param input stream to copy; closed before returning, also on failure
         * @return the written file
         * @throws Exception if {@code input} is null, the path is invalid or the copy fails
         */
        public static File save(String path, InputStream input) throws Exception {
            if (input == null) {
                e("不能保存输入流,输入流不存在:" + input);
            }
            try (InputStream in = input) {
                File createFile = mkdirs(path);
                try (FileOutputStream fileOutputStream = new FileOutputStream(createFile)) {
                    byte[] data = new byte[1024];
                    int len;
                    while ((len = in.read(data)) != -1) {
                        fileOutputStream.write(data, 0, len);
                    }
                }
                return createFile;
            }
        }

        /**
         * Writes a string to a file.
         *
         * @param path    target file
         * @param content text to write; must not be null or empty
         * @param charset charset name, utf-8 by default
         * @return the written file
         * @throws Exception if the content or the path is invalid or the write fails
         */
        public static File save(String path, String content, String charset) throws Exception {
            // Validate first, so that a rejected call does not leave directories behind.
            if (isNull(content)) {
                e("内容不存在,不能保存:" + content);
            }
            if (isNull(charset)) {
                charset = "utf-8";
            }
            File createFile = mkdirs(path);
            try (FileOutputStream outFile = new FileOutputStream(createFile)) {
                outFile.write(content.getBytes(charset));
            }
            return createFile;
        }

        /**
         * Makes sure the parent directory of {@code path} exists. The file itself is not created.
         *
         * @param path file path
         * @return a {@link File} for {@code path}
         * @throws Exception if {@code path} is null or empty, or the directory cannot be created
         * @since 0.0.2
         */
        public static File mkdirs(String path) throws Exception {
            if (isNull(path)) {
                e("不能创建文件或者路径,地址不存在:" + path);
            }
            File createFile = new File(path);
            // A bare file name has no parent; there is nothing to create then.
            File parent = createFile.getParentFile();
            if (parent != null && !parent.exists() && !parent.mkdirs() && !parent.isDirectory()) {
                e("无法创建目录:" + parent);
            }
            return createFile;
        }

        /**
         * Tells whether a string is null or empty.
         *
         * @param content string to test
         * @return true when {@code content} is null or has length 0
         */
        public static boolean isNull(String content) {
            return content == null || content.length() == 0;
        }

        /**
         * Throws a plain exception with the given message.
         *
         * @param msg exception message
         * @throws Exception always
         */
        public static void e(String msg) throws Exception {
            throw new Exception(msg);
        }
    }

    /**
     * HTML parser callback.
     *
     * @author Administrator
     */
    public interface Parsing {
        /**
         * Extracts addresses from a page.
         *
         * @param page page text
         * @return the addresses found; {@link Reptile#kernelSingle} casts each element to String
         */
        Set urls(String page);
    }

    /**
     * Worker that downloads one byte range of a file into its place in the target file.
     *
     * @deprecated kept for compatibility
     */
    @Deprecated
    class DownLoadThread extends Thread {
        long startIndex;
        private int threadId;
        long endIndex;
        private String urlString;
        private String path;

        public String getPath() {
            return path;
        }

        public DownLoadThread setPath(String path) {
            this.path = path;
            return this;
        }

        public DownLoadThread(long startIndex, long endIndex, String url, int id) {
            this.endIndex = endIndex;
            this.startIndex = startIndex;
            this.urlString = url;
            this.threadId = id;
        }

        @Override
        public void run() {
            HttpURLConnection conn = null;
            try {
                conn = (HttpURLConnection) new URL(urlString).openConnection();
                conn.setRequestMethod("GET");
                conn.setConnectTimeout(1000 * 30);
                conn.setReadTimeout(1000 * 30);
                // Ask the server for this worker's byte range only.
                conn.setRequestProperty("Range", "bytes=" + startIndex + "-" + endIndex);
                int status = conn.getResponseCode();
                // A ranged request is answered with 206 (Partial Content).
                if (status != 206) {
                    System.out.println("第" + threadId + "个线程下载失败,响应码:" + status);
                    return;
                }
                long total = endIndex - startIndex + 1;
                try (InputStream is = conn.getInputStream();
                     RandomAccessFile mAccessFile = new RandomAccessFile(new File(this.path), "rwd")) {
                    mAccessFile.seek(startIndex); // write at this segment's offset
                    byte[] bs = new byte[1024];
                    long current = 0;
                    int len;
                    while ((len = is.read(bs)) != -1) {
                        mAccessFile.write(bs, 0, len);
                        current += len;
                        // Progress is relative to the segment length, not to the start offset.
                        double percent = total > 0 ? (double) current / total * 100 : 100;
                        System.out.println("第" + threadId + "个线程下载了" + current + "进度:" + String.format("%.3f", percent) + "%");
                    }
                }
                System.out.println("第" + threadId + "个线程下载完毕");
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (conn != null) {
                    conn.disconnect();
                }
            }
        }
    }

    /**
     * Downloader callback used by {@link Reptile#kernelSingle}.
     *
     * @author Administrator
     */
    public interface Downloader {
        /**
         * Downloads one address.
         *
         * @param url  network address
         * @param path target path
         * @return implementation defined; ignored by {@link Reptile#kernelSingle}
         */
        public Object download(String url, String path);
    }
}
base64
import java.io.*;
/**
 * Base64 encoding and decoding, plus conversion of {@code thunder://} links.
 * <p>
 * Strings are converted to and from bytes as UTF-8, so results do not depend on
 * the platform default charset.
 *
 * @author 宋立君
 * @date 2014-07-03
 */
public class Base64 {

    /** Charset used to turn text into bytes and back. */
    private static final java.nio.charset.Charset TEXT_CHARSET = java.nio.charset.StandardCharsets.UTF_8;

    /** Charset for files holding Base64 text; every character of the alphabet is ASCII. */
    private static final java.nio.charset.Charset FILE_CHARSET = java.nio.charset.StandardCharsets.US_ASCII;

    /** Scheme prefix of a Thunder download link. */
    private static final String THUNDER_PREFIX = "thunder://";

    public Base64() {
    }

    /**
     * Encodes a string.
     *
     * @param data source string, converted to bytes as UTF-8
     * @return the Base64 text, padded with {@code =}
     */
    public static String encode(String data) {
        return new String(encode(data.getBytes(TEXT_CHARSET)));
    }

    /**
     * Decodes a Base64 string.
     *
     * @param data Base64 text; characters outside the alphabet (padding included) are ignored
     * @return the decoded bytes interpreted as UTF-8
     */
    public static String decode(String data) {
        return new String(decode(data.toCharArray()), TEXT_CHARSET);
    }

    /**
     * Encodes a byte array.
     *
     * @param data source bytes
     * @return the Base64 characters, padded with {@code =} to a multiple of four
     */
    public static char[] encode(byte[] data) {
        char[] out = new char[((data.length + 2) / 3) * 4];
        for (int i = 0, index = 0; i < data.length; i += 3, index += 4) {
            boolean quad = false;
            boolean trip = false;
            // Pack up to three bytes into the low 24 bits of val.
            int val = (0xFF & (int) data[i]);
            val <<= 8;
            if ((i + 1) < data.length) {
                val |= (0xFF & (int) data[i + 1]);
                trip = true;
            }
            val <<= 8;
            if ((i + 2) < data.length) {
                val |= (0xFF & (int) data[i + 2]);
                quad = true;
            }
            // Emit four 6-bit groups, last one first; index 64 is the '=' padding.
            out[index + 3] = alphabet[(quad ? (val & 0x3F) : 64)];
            val >>= 6;
            out[index + 2] = alphabet[(trip ? (val & 0x3F) : 64)];
            val >>= 6;
            out[index + 1] = alphabet[val & 0x3F];
            val >>= 6;
            out[index + 0] = alphabet[val & 0x3F];
        }
        return out;
    }

    /**
     * Decodes Base64 characters.
     *
     * @param data encoded characters; anything outside the alphabet is skipped
     * @return the decoded bytes
     */
    public static byte[] decode(char[] data) {
        int tempLen = data.length;
        for (int ix = 0; ix < data.length; ix++) {
            if ((data[ix] > 255) || codes[data[ix]] < 0) {
                --tempLen; // ignore non-valid chars and padding
            }
        }
        // calculate required length:
        // -- 3 bytes for every 4 valid base64 chars
        // -- plus 2 bytes if there are 3 extra base64 chars,
        // or plus 1 byte if there are 2 extra.
        int len = (tempLen / 4) * 3;
        if ((tempLen % 4) == 3) {
            len += 2;
        }
        if ((tempLen % 4) == 2) {
            len += 1;
        }
        byte[] out = new byte[len];
        int shift = 0; // # of excess bits stored in accum
        int accum = 0; // excess bits
        int index = 0;
        // we now go through the entire array (NOT using the 'tempLen' value)
        for (int ix = 0; ix < data.length; ix++) {
            int value = (data[ix] > 255) ? -1 : codes[data[ix]];
            if (value >= 0) { // skip over non-code
                accum <<= 6; // bits shift up by 6 each time thru
                shift += 6; // loop, with new bits being put in
                accum |= value; // at the bottom.
                if (shift >= 8) { // whenever there are 8 or more shifted in,
                    shift -= 8; // write them out (from the top, leaving any
                    out[index++] = // excess at the bottom for next iteration.
                        (byte) ((accum >> shift) & 0xff);
                }
            }
        }
        // Defensive invariant check on the length computed above.
        if (index != out.length) {
            throw new Error("Miscalculated data length (wrote " + index
                    + " instead of " + out.length + ")");
        }
        return out;
    }

    /**
     * Replaces the content of a file with its Base64 encoding.
     *
     * @param file file to encode in place
     * @throws FileNotFoundException if the file does not exist
     * @throws IOException           if reading or writing fails
     */
    public static void encode(File file) throws IOException {
        requireExisting(file);
        writeChars(file, encode(readBytes(file)));
    }

    /**
     * Replaces the Base64 content of a file with the decoded bytes.
     *
     * @param file file to decode in place
     * @throws FileNotFoundException if the file does not exist
     * @throws IOException           if reading or writing fails
     */
    public static void decode(File file) throws IOException {
        requireExisting(file);
        writeBytes(file, decode(readChars(file)));
    }

    /** Rejects a missing file with an exception instead of terminating the JVM. */
    private static void requireExisting(File file) throws FileNotFoundException {
        if (!file.exists()) {
            throw new FileNotFoundException(file.getPath());
        }
    }

    //
    // code characters for values 0..63, followed by the padding character at index 64
    //
    private static final char[] alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
            .toCharArray();

    //
    // lookup table for converting base64 characters to value in range 0..63;
    // -1 marks characters that are not part of the alphabet
    //
    private static final byte[] codes = new byte[256];
    static {
        for (int i = 0; i < 256; i++) {
            codes[i] = -1;
        }
        for (int i = 'A'; i <= 'Z'; i++) {
            codes[i] = (byte) (i - 'A');
        }
        for (int i = 'a'; i <= 'z'; i++) {
            codes[i] = (byte) (26 + i - 'a');
        }
        for (int i = '0'; i <= '9'; i++) {
            codes[i] = (byte) (52 + i - '0');
        }
        codes['+'] = 62;
        codes['/'] = 63;
    }

    /** Reads a whole file into a byte array. */
    private static byte[] readBytes(File file) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (InputStream is = new BufferedInputStream(new FileInputStream(file))) {
            byte[] buf = new byte[16384];
            int count;
            while ((count = is.read(buf)) != -1) {
                baos.write(buf, 0, count);
            }
        }
        return baos.toByteArray();
    }

    /** Reads a whole file of Base64 text into a char array. */
    private static char[] readChars(File file) throws IOException {
        CharArrayWriter caw = new CharArrayWriter();
        try (Reader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), FILE_CHARSET))) {
            char[] buf = new char[16384];
            int count;
            while ((count = in.read(buf)) != -1) {
                caw.write(buf, 0, count);
            }
        }
        return caw.toCharArray();
    }

    /** Overwrites a file with the given bytes; close failures are reported, not swallowed. */
    private static void writeBytes(File file, byte[] data) throws IOException {
        try (OutputStream os = new BufferedOutputStream(new FileOutputStream(file))) {
            os.write(data);
        }
    }

    /** Overwrites a file with the given Base64 characters; close failures are reported. */
    private static void writeChars(File file, char[] data) throws IOException {
        try (Writer os = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), FILE_CHARSET))) {
            os.write(data);
        }
    }

    public static void main(String[] args) {
        String url="thunder://QUFodHRwOi8veHVubGVpLnp1aWRheHVubGVpLmNvbS8xOTAyL0jplafkvKAtMjUubXA0Wlo=";
        System.out.println("迅雷地址:"+url);
        url=Base64.conver(url);
        System.out.println("普通地址:"+url);
    }

    /**
     * Converts a {@code thunder://} link into the ordinary address it wraps.
     * <p>
     * The part after the prefix is Base64 text whose decoded form is the real address
     * wrapped in a leading {@code AA} and a trailing {@code ZZ}; the first two and the
     * last two characters are removed.
     *
     * @param url Thunder link; the prefix is matched case-insensitively
     * @return the wrapped address, decoded as UTF-8
     * @throws IllegalArgumentException if {@code url} is null, lacks the prefix or is too short
     */
    public static String conver(String url) {
        if (url == null || !url.regionMatches(true, 0, THUNDER_PREFIX, 0, THUNDER_PREFIX.length())) {
            throw new IllegalArgumentException("Not a thunder:// link: " + url);
        }
        // Drop the scheme prefix, then decode the payload.
        String decoded = Base64.decode(url.substring(THUNDER_PREFIX.length()));
        if (decoded.length() < 4) {
            throw new IllegalArgumentException("Thunder link payload is too short: " + url);
        }
        // Strip the "AA" head and the "ZZ" tail.
        return decoded.substring(2, decoded.length() - 2);
    }
}
httpclient 的 maven 依赖(groupId:artifactId:version):
commons-codec:commons-codec:1.11
commons-logging:commons-logging:1.2
org.apache.httpcomponents:fluent-hc:4.5.7
org.apache.httpcomponents:httpclient:4.5.7
org.apache.httpcomponents:httpclient-cache:4.5.7
org.apache.httpcomponents:httpclient-osgi:4.5.7
org.apache.httpcomponents:httpclient-win:4.5.7
org.apache.httpcomponents:httpcore:4.4.11
org.apache.httpcomponents:httpmime:4.5.7
net.java.dev.jna:jna:4.5.2
net.java.dev.jna:jna-platform:4.5.2