源:http://blog.csdn.net/zcwfengbingdongguke/article/details/6519351
评:
package fdl;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.util.EntityUtils;
public class HttpClientTest {
/**
 * Entry point: runs {@link #init(String)} against one hard-coded sample
 * video URL.
 *
 * @param args command-line arguments; not read
 * @throws Exception anything {@code init} throws
 */
public static void main(String[] args) throws Exception {
    // Earlier experiments targeted a Google search page and a ku6 .f4v
    // stream; only the sample below is still wired in.
    final String sampleVideoUrl = "http://www.bhtv.cc:81/**b/2010/02/08/1.flv";
    init(sampleVideoUrl);
}
/**
 * Fetches {@code url} with an IE-like GET request and saves the response
 * body to {@code d:/my.flv}.
 *
 * <p>The status code and the Location, Content-Type and Content-Length
 * headers are printed to stdout before the body is written. The request is
 * executed exactly once, the body is streamed to the file instead of being
 * buffered in memory, and both the file and the HTTP client are released
 * even when an error occurs.
 *
 * @param url absolute URL to download
 * @return the executed response; its body has already been written to
 *         disk, so callers should not expect to read the entity again
 * @throws Exception on any HTTP or file I/O failure
 */
public static HttpResponse init(String url) throws Exception {
    HttpClient httpclient = new DefaultHttpClient();
    try {
        HttpGet httpget = new HttpGet(url);
        // Show the Accept-Charset header before any header has been set.
        System.out.println("Accept-Charset:"
                + httpget.getFirstHeader("Accept-Charset"));
        // Per the original author: without a User-Agent the server answered
        // in gb2312 whatever Accept-Charset asked for (seen on google.cn).
        httpget.setHeader("User-Agent", RquestHeader.IE.valueOf());
        httpget.setHeader("Accept-Language", "zh-cn");
        // NOTE(review): nothing in this method decompresses the body, so a
        // server that honours this header would produce a compressed file
        // on disk - confirm the header is really wanted.
        httpget.setHeader("Accept-Encoding", "gzip, deflate");
        httpget.setHeader("Connection", "Keep-Alive");
        httpget.setHeader(
                "Accept",
                "image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/QVOD, application/QVOD, */*");
        // Left disabled by the original author:
        // httpget.setHeader("Referer", "http://www.bhtv.cc/");

        System.out.println("executing request " + httpget.getURI());
        HttpResponse response = httpclient.execute(httpget);
        System.out.println("----------------------------------------");
        System.out.println("Location: " + response.getLastHeader("Location"));
        System.out.println(response.getStatusLine().getStatusCode());
        System.out.println(response.getLastHeader("Content-Type"));
        System.out.println(response.getLastHeader("Content-Length"));
        System.out.println("----------------------------------------");
        int statusCode = response.getStatusLine().getStatusCode();
        System.out.println("statusCode: " + statusCode);

        // Reuse the response obtained above; issuing the request a second
        // time would abandon the first connection without releasing it.
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            OutputStream os = new FileOutputStream(new File("d:/my.flv"));
            try {
                // Stream straight to disk so a large video is never held
                // in memory as one byte array.
                entity.writeTo(os);
                os.flush();
            } finally {
                os.close();
            }
        }
        return response;
    } finally {
        httpclient.getConnectionManager().shutdown();
    }
}
public static String baseHttp(HttpResponse response, String url)
throws Exception {
long start = System.currentTimeMillis();
HttpClient httpclient = new DefaultHttpClient();
// Get hold of the response entity
HttpEntity entity = response.getEntity();
// 查看所有返回头部信息
Header headers[] = response.getAllHeaders();
int ii = 0;
while (ii < headers.length) {
System.out.println(headers[ii].getName() + ": "
+ headers[ii].getValue());
++ii;
}
System.out.println("----------------------------------------");
// If the response does not enclose an entity, there is no need
// to bother about connection release
if (entity != null) {
// 将源码流保存在一个byte数组当中,因为可能需要两次用到该流,
byte[] bytes = EntityUtils.toByteArray(entity);
String charSet = "";
// 如果头部Content-Type中包含了编码信息,那么我们可以直接在此处获取
charSet = EntityUtils.getContentCharSet(entity);
System.out.println("In header: " + charSet);
// 如果头部中没有,那么我们需要 查看页面源码,这个方法虽然不能说完全正确,因为有些粗糙的网页编码者没有在页面中写头部编码信息
if (charSet == "" || charSet == null) {
String regEx_html = "(?=<meta).+?(?<=charset=['/"]?)([//w-]+)(?=['/"//s+])";
String regEx_xml = "(?=<//?xml).+?(?<=encoding=['/"]?)([//w-]+)(?=['/"//s+])";
Pattern p_html = Pattern.compile(regEx_html,
Pattern.CASE_INSENSITIVE);
Pattern p_xml = Pattern.compile(regEx_xml,
Pattern.CASE_INSENSITIVE);
Matcher m_html = p_html.matcher(new String(bytes)); // 默认编码转成字符串,因为我们的匹配中无中文,所以串中可能的乱码对我们没有影响
if (m_html.find())
charSet = m_html.group(1);
else {
Matcher m_xml = p_xml.matcher(new String(bytes)); // 默认编码转成字符串,因为我们的匹配中无中文,所以串中可能的乱码对我们没有影响
if (m_xml.find())
charSet = m_xml.group(1);
else
charSet = "GBK";
}
}
if (charSet.toUpperCase().startsWith("GB")) // 处理汉字编码集过小情况
charSet = "GBK";
System.out.println("Last get: " + charSet);
// 至此,我们可以将原byte数组按照正常编码专成字符串输出(如果找到了编码的话)
String txt = new String(bytes, charSet);
// System.out.println("Encoding string is:/n" + txt);
long end = System.currentTimeMillis();
System.out.println("Cost time is: " + (end - start) / 1000.00
+ " s.");
return txt;
}
httpclient.getConnectionManager().shutdown();
httpclient.getConnectionManager();
return "";
}
public static String simpleHttpClient(String url) throws Exception {
long start = System.currentTimeMillis();
// 初始化,此处构造函数就与3.1中不同
HttpClient httpclient = new DefaultHttpClient();
// 这里的http.socket.timeout相当于SO_TIMEOUT
// httpclient.getParams().setIntParameter("http.socket.timeout", 1);
HttpParams params = httpclient.getParams();
HttpConnectionParams.setConnectionTimeout(params, 5000);
HttpConnectionParams.setSoTimeout(params, 10000);
// HttpHost targetHost = new HttpHost("3g.youku.com");
HttpGet httpget = new HttpGet(url);
// HttpGet httpget = new HttpGet("/");
// 查看默认request头部信息
System.out.println("Accept-Charset:"
+ httpget.getFirstHeader("Accept-Charset"));
// 以下这条如果不加会发现无论你设置Accept-Charset为gbk还是utf-8,他都会默认返回gb2312(本例针对google.cn来说)
httpget.setHeader("User-Agent",
// "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413");
RquestHeader.FIREFOX.valueOf());
// "Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.6.30 Version/10.70");
// 用逗号分隔显示可以同时接受多种编码
httpget.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
httpget.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
httpget
.setHeader("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
httpget.setHeader("Accept-Encoding", "gzip,deflate");
httpget.setHeader("Keep-Alive", "115");
httpget.setHeader("Connection", "keep-alive");
// 验证头部信息设置生效
System.out.println("Accept-Charset:"
+ httpget.getFirstHeader("Accept-Charset").getValue());
// Execute HTTP request
System.out.println("executing request " + httpget.getURI());
// HttpResponse response = httpclient.execute(targetHost, httpget);
HttpResponse response = httpclient.execute(httpget);
response.setEntity(null);
System.out.println("----------------------------------------");
System.out.println("Location: " + response.getLastHeader("Location"));
System.out.println(response.getStatusLine().getStatusCode());
System.out.println(response.getLastHeader("Content-Type"));
System.out.println(response.getLastHeader("Content-Length"));
System.out.println("----------------------------------------");
// 判断页面返回状态判断是否进行转向抓取新链接
int statusCode = response.getStatusLine().getStatusCode();
System.out.println("statusCode: " + statusCode);
if ((statusCode == HttpStatus.SC_MOVED_PERMANENTLY)
|| (statusCode == HttpStatus.SC_MOVED_TEMPORARILY)
|| (statusCode == HttpStatus.SC_SEE_OTHER)
|| (statusCode == HttpStatus.SC_TEMPORARY_REDIRECT)) {
// 此处重定向处理 此处还未验证
String newUri = response.getLastHeader("Location").getValue();
System.out.println("newUri: ".concat(newUri));
httpclient = new DefaultHttpClient();
httpget = new HttpGet(newUri);
response = httpclient.execute(httpget);
}
// Get hold of the response entity
HttpEntity entity = response.getEntity();
// 查看所有返回头部信息
Header headers[] = response.getAllHeaders();
int ii = 0;
while (ii < headers.length) {
System.out.println(headers[ii].getName() + ": "
+ headers[ii].getValue());
++ii;
}
System.out.println("----------------------------------------");
// If the response does not enclose an entity, there is no need
// to bother about connection release
if (entity != null) {
// 将源码流保存在一个byte数组当中,因为可能需要两次用到该流,
byte[] bytes = EntityUtils.toByteArray(entity);
{
//
File file = new File("c:/a.flv");
OutputStream os = new FileOutputStream(file);
os.write(bytes);
os.flush();
os.close();
}
String charSet = "";
// 如果头部Content-Type中包含了编码信息,那么我们可以直接在此处获取
charSet = EntityUtils.getContentCharSet(entity);
System.out.println("In header: " + charSet);
// 如果头部中没有,那么我们需要 查看页面源码,这个方法虽然不能说完全正确,因为有些粗糙的网页编码者没有在页面中写头部编码信息
if (charSet == "" || charSet == null) {
String regEx_html = "(?=<meta).+?(?<=charset=['/"]?)([//w-]+)(?=['/"//s+])";
String regEx_xml = "(?=<//?xml).+?(?<=encoding=['/"]?)([//w-]+)(?=['/"//s+])";
Pattern p_html = Pattern.compile(regEx_html,
Pattern.CASE_INSENSITIVE);
Pattern p_xml = Pattern.compile(regEx_xml,
Pattern.CASE_INSENSITIVE);
Matcher m_html = p_html.matcher(new String(bytes)); // 默认编码转成字符串,因为我们的匹配中无中文,所以串中可能的乱码对我们没有影响
if (m_html.find())
charSet = m_html.group(1);
else {
Matcher m_xml = p_xml.matcher(new String(bytes)); // 默认编码转成字符串,因为我们的匹配中无中文,所以串中可能的乱码对我们没有影响
if (m_xml.find())
charSet = m_xml.group(1);
else
charSet = "GBK";
}
}
if (charSet.toUpperCase().startsWith("GB")) // 处理汉字编码集过小情况
charSet = "GBK";
System.out.println("Last get: " + charSet);
// 至此,我们可以将原byte数组按照正常编码专成字符串输出(如果找到了编码的话)
String txt = new String(bytes, charSet);
// System.out.println("Encoding string is:/n" + txt);
long end = System.currentTimeMillis();
System.out.println("Cost time is: " + (end - start) / 1000.00
+ " s.");
return txt;
}
httpclient.getConnectionManager().shutdown();
return "";
}
public static void test1(String url) throws Exception {
// (?<=<a).+?href=["']?(.+?)(?=["' >])
Pattern p = Pattern.compile("<a//s+?href=[/"']?(.*?)[/"'//s >]",
Pattern.CASE_INSENSITIVE);
String txt = simpleHttpClient(url);
long b = System.nanoTime();
Matcher m = p.matcher(txt);
Pattern pt = Pattern.compile("^((javascript|mailto):.*)|([#/])$",
Pattern.CASE_INSENSITIVE);
while (m.find()) {
// System.out.println(m.group(1));
String rs = m.group(1);
Matcher mt = pt.matcher(rs);
if (!mt.matches())
System.out.println("--/t"
.concat(isRelativeAddressToFullUrlAddressNew(rs, url)));
}
long e = System.nanoTime() - b;
System.out.println("Cost time is: " + (e / 1000000000.00) + " s.");
}
/**
 * Converts a link found on a page into an absolute URL string, with special
 * handling for fragment-only links (those starting with "#").
 *
 * <p>The link is trimmed first. A fragment-only link replaces any fragment
 * already present on {@code base}; an empty link yields the page URL
 * itself; every other link is resolved by
 * {@link #isRelativeAddressToFullUrlAddress(String, String)}.
 *
 * @param link
 *            the link to convert
 * @param base
 *            URL of the page the link was found on
 * @return the absolute URL, or {@code link} unchanged when a URL cannot be
 *         built from the inputs
 */
public static String isRelativeAddressToFullUrlAddressNew(String link,
        String base) {
    String dealLinkStr = link.trim();
    try {
        URL url;
        if (dealLinkStr.length() == 0) {
            // An empty reference designates the page itself.
            url = new URL(base);
        } else if (dealLinkStr.startsWith("#")) {
            // Drop an existing fragment so the result has exactly one.
            int hash = base.indexOf('#');
            String page = (hash >= 0) ? base.substring(0, hash) : base;
            url = new URL(page + dealLinkStr);
        } else {
            // Resolve the trimmed link, not the raw one.
            url = isRelativeAddressToFullUrlAddress(dealLinkStr, base);
        }
        return url.toString();
    } catch (MalformedURLException e) {
        // Malformed input is handed back untouched; the original author
        // left logging disabled here.
        return link;
    }
}
/**
 * Builds an absolute URL from a link and the URL of the page it was found
 * on.
 *
 * <p>On top of plain {@link URL} resolution this method:
 * <ul>
 * <li>resolves a query-only link ("?a=b") against the page itself, keeping
 * the page's path and replacing only its query;</li>
 * <li>drops leading "/../" and "/./" segments that relative links sometimes
 * leave at the start of the path;</li>
 * <li>removes a backslash that directly follows a slash.</li>
 * </ul>
 *
 * @param link
 *            the link to convert; may be empty
 * @param base
 *            URL of the page the link was found on
 * @return the absolute URL
 * @exception MalformedURLException
 *                If creating the URL fails.
 */
public static URL isRelativeAddressToFullUrlAddress(String link, String base)
        throws MalformedURLException {
    URL url; // constructed URL combining relative link and base
    if (link.startsWith("?")) {
        // Query-only link: strip fragment and query from the base, then
        // append the new query, so the page's file name is kept.
        String page = base;
        int cut = page.indexOf('#');
        if (cut >= 0) {
            page = page.substring(0, cut);
        }
        cut = page.indexOf('?');
        if (cut >= 0) {
            page = page.substring(0, cut);
        }
        url = new URL(page + link);
    } else {
        url = new URL(new URL(base), link);
    }

    String path = url.getFile();
    boolean modified = false;
    if (!link.startsWith("/")) {
        // Fix relative links that climb above the root. Only whole "."
        // and ".." segments are removed, so a dot-directory such as
        // "/.well-known" is left intact.
        while (true) {
            if (path.startsWith("/../")) {
                path = path.substring(3);
            } else if (path.startsWith("/./")) {
                path = path.substring(2);
            } else {
                break;
            }
            modified = true;
        }
    }
    // fix backslashes: drop a '\' that directly follows a '/'
    int index;
    while (-1 != (index = path.indexOf("/\\"))) {
        path = path.substring(0, index + 1) + path.substring(index + 2);
        modified = true;
    }
    if (modified) {
        url = new URL(url, path);
    }
    return url;
}
/**
 * Reports whether {@code urlStr} contains a question mark.
 *
 * @param urlStr text to inspect
 * @return {@code true} if at least one '?' occurs in {@code urlStr}
 */
private static boolean haveQM(String urlStr) {
    return urlStr.indexOf('?') >= 0;
}
}
/**
 * User-Agent header values for the browsers this client can present itself
 * as.
 *
 * @author Jerome
 */
enum RquestHeader {
    Oper("Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.6.30 Version/10.70"),
    IPhone("Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16"),
    IE("Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; .NET CLR 2.0.50727)"),
    FIREFOX("Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13"),
    // NOTE(review): this string advertises MSIE 7.0 with Trident/4.0 while
    // IE above advertises MSIE 8.0 - confirm the naming is intended.
    IE8("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/4.0)");

    /** Header value carried by this constant. */
    private final String userAgent;

    RquestHeader(String userAgent) {
        this.userAgent = userAgent;
    }

    /**
     * Returns the User-Agent header value for this constant.
     *
     * @return the header value
     */
    public String valueOf() {
        return userAgent;
    }
}
将 init() 方法中被注释掉的下面两行取消注释(即在请求中加上 Referer 头)
// httpget.setHeader("Referer",
// "http://www.bhtv.cc/");
就可以了