使用 HttpURLConnection 和 HttpClient + Jsoup 模拟登录、抓取页面并处理标签
博客分类: httpclient
HttpURLConnection HttpClient Jsoup
HttpURLConnection抓取
Java代码 收藏代码
package com.app.html;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
/**
 * Simulated login with {@link HttpURLConnection}: the credentials are posted to the login URL and
 * the cookies handed back are replayed on a second request that downloads the personal page.
 */
public class Html {
    private static final String loginURL = "http://login.goodjobs.cn/index.php/action/UserLogin";
    private static final String forwardURL = "http://user.goodjobs.cn/dispatcher.php/module/Personal/?skip_fill=1";

    /**
     * Logs in and returns the content of the page behind the login.
     *
     * @param params {@code params[0]} is the user name, {@code params[1]} the password
     * @return the downloaded page; every line is preceded by a line feed
     * @throws IllegalArgumentException if fewer than two values are supplied
     * @throws Exception if either HTTP request fails
     */
    public static String createHtml(String... params) throws Exception {
        // Fail before any network traffic instead of with an index error halfway through the login.
        if (params == null || params.length < 2) {
            throw new IllegalArgumentException("user name and password are required");
        }
        URL url = new URL(loginURL);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setDoOutput(true);
        loginHtml(conn, params);
        return forwardHtml(conn, url);
    }

    /**
     * Sends the login form as the request body of {@code conn}.
     *
     * @param conn   connection to the login URL, already configured for output
     * @param params {@code params[0]} is the user name, {@code params[1]} the password
     * @throws Exception if the body cannot be written
     */
    private static void loginHtml(HttpURLConnection conn, String... params)
            throws Exception {
        String form = "memberName=" + URLEncoder.encode(params[0], "UTF-8")
                + "&password=" + URLEncoder.encode(params[1], "UTF-8");
        OutputStreamWriter out = new OutputStreamWriter(conn.getOutputStream(), "GBK");
        try {
            out.write(form);
            out.flush();
        } finally {
            out.close();
        }
    }

    /**
     * Requests the personal page, presenting the cookies returned by the login response, and also
     * saves the page to {@code d:/forward.html}.
     *
     * @param conn the login connection whose response headers carry the session cookies
     * @param url  not used; kept so existing callers keep compiling
     * @return the downloaded page; every line is preceded by a line feed
     * @throws Exception if the request fails
     */
    public static String forwardHtml(HttpURLConnection conn, URL url) throws Exception {
        String cookieVal = collectCookies(conn);

        // A fresh connection is needed for the second request.
        HttpURLConnection page = (HttpURLConnection) new URL(forwardURL).openConnection();
        page.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
        page.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Foxy/1; .NET CLR 2.0.50727;MEGAUPLOAD 1.0)");
        // Instance-level setter: the static setFollowRedirects would change every connection in the JVM.
        page.setInstanceFollowRedirects(false);
        if (cookieVal != null) {
            // Without the cookies the server treats the request as unauthenticated.
            page.setRequestProperty("Cookie", cookieVal);
        }
        page.connect();

        StringBuilder content = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader(page.getInputStream(), "GBK"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append("\n").append(line);
            }
        } finally {
            reader.close();
        }

        String html = content.toString();
        write(html, "d:/forward.html");
        return html;
    }

    /**
     * Builds a {@code Cookie} request header value from every {@code Set-Cookie} response header.
     * Only the leading {@code name=value} pair of each header is kept; attributes such as
     * {@code path} or {@code expires} are not meant to be sent back.
     *
     * @param conn connection whose response headers are inspected
     * @return the pairs joined with {@code "; "}, or {@code null} when the response set no cookie
     */
    private static String collectCookies(HttpURLConnection conn) {
        StringBuilder cookies = new StringBuilder();
        for (int i = 0; ; i++) {
            String value = conn.getHeaderField(i);
            if (value == null) {
                break;
            }
            // The key is null for the status line, which equalsIgnoreCase handles.
            if (!"Set-Cookie".equalsIgnoreCase(conn.getHeaderFieldKey(i))) {
                continue;
            }
            int semicolon = value.indexOf(';');
            String pair = (semicolon < 0 ? value : value.substring(0, semicolon)).trim();
            if (pair.length() == 0) {
                continue;
            }
            if (cookies.length() > 0) {
                cookies.append("; ");
            }
            cookies.append(pair);
        }
        return cookies.length() == 0 ? null : cookies.toString();
    }

    /**
     * Writes a line feed followed by {@code content} to {@code htmlPath}, encoded as GBK.
     *
     * @param content  text to store
     * @param htmlPath destination file; it is created or overwritten
     * @return {@code true} on success, {@code false} if the file could not be written (the
     *         exception is printed to standard error)
     */
    public static boolean write(String content, String htmlPath) {
        FileOutputStream fileOut = null;
        try {
            fileOut = new FileOutputStream(htmlPath);
            Writer out = new BufferedWriter(new OutputStreamWriter(fileOut, "GBK"));
            out.write("\n" + content);
            // Flush explicitly so a write failure is reported here rather than lost in close().
            out.flush();
            return true;
        } catch (IOException ex) {
            // Also covers FileNotFoundException and UnsupportedEncodingException.
            ex.printStackTrace();
            return false;
        } finally {
            if (fileOut != null) {
                try {
                    fileOut.close();
                } catch (IOException ignored) {
                    // Data was already flushed; nothing useful can be done here.
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        String[] params = {"admin", "admin12"};
        System.out.println(createHtml(params));
    }
}
HttpClient 抓取页面（未处理样式）
Java代码 收藏代码
package com.app.html;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.cookie.CookieSpec;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
/**
 * Simulated login with Commons HttpClient: posts the credentials, then reuses the same client
 * (and therefore its cookies) to download the personal page into a dated file under {@code d:/}.
 */
public class HttpClientHtml {
    private static final String SITE = "login.goodjobs.cn";
    private static final int PORT = 80;
    private static final String loginAction = "/index.php/action/UserLogin";
    private static final String forwardURL = "http://user.goodjobs.cn/dispatcher.php/module/Personal/?skip_fill=1";

    /**
     * Posts the login form and prints the cookies the client holds for the login host.
     *
     * @param LOGON_SITE   host name of the login server
     * @param LOGON_PORT   port of the login server
     * @param login_Action path of the login action on that host
     * @param params       {@code params[0]} is the user name, {@code params[1]} the password
     * @return the client that performed the login
     * @throws Exception if the request fails
     */
    private static HttpClient loginHtml(String LOGON_SITE, int LOGON_PORT, String login_Action, String... params) throws Exception {
        HttpClient client = new HttpClient();
        client.getHostConfiguration().setHost(LOGON_SITE, LOGON_PORT);

        PostMethod post = new PostMethod(login_Action);
        try {
            NameValuePair userName = new NameValuePair("memberName", params[0]);
            NameValuePair password = new NameValuePair("password", params[1]);
            post.setRequestBody(new NameValuePair[] { userName, password });
            client.executeMethod(post);
        } finally {
            // Release even when the request throws, otherwise the connection is never handed back.
            post.releaseConnection();
        }

        // Show which cookies the login produced.
        CookieSpec cookiespec = CookiePolicy.getDefaultSpec();
        Cookie[] cookies = cookiespec.match(LOGON_SITE, LOGON_PORT, "/", false,
                client.getState().getCookies());
        if (cookies != null) {
            if (cookies.length == 0) {
                System.out.println("Cookies is not Exists ");
            } else {
                for (int i = 0; i < cookies.length; i++) {
                    System.out.println(cookies[i].toString());
                }
            }
        }
        return client;
    }

    /**
     * Downloads {@code newUrl} with the logged-in client and stores it as
     * {@code d:/<yyyy-MM-dd>.html}.
     *
     * @param client client returned by the login step
     * @param newUrl URL of the page to download
     * @throws Exception if the request fails
     */
    private static void createHtml(HttpClient client, String newUrl)
            throws Exception {
        PostMethod post = new PostMethod(newUrl);
        try {
            client.executeMethod(post);
            // NOTE(review): presumably the charset used to decode the body when the response
            // does not declare one - confirm against the HttpClient 3.x documentation.
            post.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, "GBK");
            String content = post.getResponseBodyAsString();
            SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
            write(content, "d:/" + format.format(new Date()) + ".html");
        } finally {
            post.releaseConnection();
        }
    }

    public static void main(String[] args) throws Exception {
        String[] params = {"admin", "admin123"};
        HttpClient client = loginHtml(SITE, PORT, loginAction, params);
        // Fetch the page that requires the login.
        createHtml(client, forwardURL);
    }

    /**
     * Writes a line feed followed by {@code content} to {@code htmlPath}, encoded as GBK.
     *
     * @param content  text to store
     * @param htmlPath destination file; it is created or overwritten
     * @return {@code true} on success, {@code false} if the file could not be written (the
     *         exception is printed to standard error)
     */
    public static boolean write(String content, String htmlPath) {
        FileOutputStream fileOut = null;
        try {
            fileOut = new FileOutputStream(htmlPath);
            Writer out = new BufferedWriter(new OutputStreamWriter(fileOut, "GBK"));
            out.write("\n" + content);
            // Flush explicitly so a write failure is reported here rather than lost in close().
            out.flush();
            return true;
        } catch (IOException ex) {
            // Also covers FileNotFoundException and UnsupportedEncodingException.
            ex.printStackTrace();
            return false;
        } finally {
            if (fileOut != null) {
                try {
                    fileOut.close();
                } catch (IOException ignored) {
                    // Data was already flushed; nothing useful can be done here.
                }
            }
        }
    }
}
HttpClient 抓取页面并处理样式后的页面效果（css 直接链接到服务器站点）
Java代码 收藏代码
package com.app.html;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.cookie.CookieSpec;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.app.comom.FileUtil;
/**
 * Logs in with Commons HttpClient, saves the personal page, then uses jsoup to produce a second
 * file whose stylesheet, script and image references point at the remote site.
 */
public class HttpClientHtml {
    private static final String SITE = "login.goodjobs.cn";
    private static final int PORT = 80;
    private static final String loginAction = "/index.php/action/UserLogin";
    private static final String forwardURL = "http://user.goodjobs.cn/dispatcher.php/module/Personal/?skip_fill=1";
    private static final String toUrl = "d:\\test\\";
    private static final String css = "http://user.goodjobs.cn/personal.css";
    private static final String Img = "http://user.goodjobs.cn/images";
    private static final String _JS = "http://user.goodjobs.cn/scripts/fValidate/fValidate.one.js";

    /**
     * Posts the login form and prints the cookies the client holds for the login host.
     *
     * @param LOGON_SITE   host name of the login server
     * @param LOGON_PORT   port of the login server
     * @param login_Action path of the login action on that host
     * @param params       {@code params[0]} is the user name, {@code params[1]} the password
     * @return the client that performed the login
     * @throws Exception if the request fails
     */
    private static HttpClient loginHtml(String LOGON_SITE, int LOGON_PORT, String login_Action, String... params) throws Exception {
        HttpClient client = new HttpClient();
        client.getHostConfiguration().setHost(LOGON_SITE, LOGON_PORT);

        PostMethod post = new PostMethod(login_Action);
        try {
            NameValuePair userName = new NameValuePair("memberName", params[0]);
            NameValuePair password = new NameValuePair("password", params[1]);
            post.setRequestBody(new NameValuePair[] { userName, password });
            client.executeMethod(post);
        } finally {
            // Release even when the request throws, otherwise the connection is never handed back.
            post.releaseConnection();
        }

        // Show which cookies the login produced.
        CookieSpec cookiespec = CookiePolicy.getDefaultSpec();
        Cookie[] cookies = cookiespec.match(LOGON_SITE, LOGON_PORT, "/", false,
                client.getState().getCookies());
        if (cookies != null) {
            if (cookies.length == 0) {
                System.out.println("Cookies is not Exists ");
            } else {
                for (int i = 0; i < cookies.length; i++) {
                    System.out.println(cookies[i].toString());
                }
            }
        }
        return client;
    }

    /**
     * Downloads {@code newUrl} with the logged-in client and stores the raw page.
     *
     * @param client client returned by the login step
     * @param newUrl URL of the page to download
     * @return path of the stored file, {@code d:\test\<yyyy-MM-dd>_1.html}
     * @throws Exception if the request fails
     */
    private static String createHtml(HttpClient client, String newUrl) throws Exception {
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        String filePath = toUrl + format.format(new Date()) + "_" + 1 + ".html";
        PostMethod post = new PostMethod(newUrl);
        try {
            client.executeMethod(post);
            // NOTE(review): presumably the charset used to decode the body when the response
            // does not declare one - confirm against the HttpClient 3.x documentation.
            post.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, "GBK");
            String content = post.getResponseBodyAsString();
            FileUtil.write(content, filePath);
            System.out.println("\n写入文件成功!");
        } finally {
            post.releaseConnection();
        }
        return filePath;
    }

    /**
     * Rebuilds the stored page with jsoup: keeps the meta tags, the personal stylesheet link, the
     * validation script and the body, and points their references at the remote site.
     *
     * @param filePath path of the page stored by the download step
     * @param random   number used in the output file name
     * @return path of the output file, {@code d:\test\<yyyy-MM-dd>_new_<random>.html}; the path is
     *         returned even when writing failed (the exception is printed to standard error)
     */
    private static String JsoupFile(String filePath, int random) {
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        File infile = new File(filePath);
        String url = toUrl + format.format(new Date()) + "_new_" + random + ".html";
        try {
            Document doc = Jsoup.parse(infile, "GBK");
            StringBuilder sb = new StringBuilder();

            // The published listing appended empty strings here, apparently markup stripped by
            // the blog platform; the wrappers are restored so the title is not emitted as bare text.
            sb.append("<html>").append("\n");
            sb.append("<head>").append("\n");
            sb.append("<title>欢迎使用新安人才网个人专区 </title>").append("\n");
            sb.append(doc.getElementsByTag("meta").toString()).append("\n");

            // Stylesheet: keep only the personal css link, pointed at the remote copy.
            for (Element link : doc.select("link")) {
                String href = link.attr("href");
                if (href.contains("/personal.css")) {
                    link.attr("href", rebase(href, "/personal.css", css));
                    sb.append(link.toString()).append("\n");
                }
            }

            // Script: keep only the validation script, pointed at the remote copy.
            for (Element js : doc.select("script")) {
                String src = js.attr("src");
                if (src.contains("/fValidate.one.js")) {
                    js.attr("src", rebase(src, "/scripts/fValidate/fValidate.one.js", _JS));
                    sb.append(js.toString()).append("\n");
                }
            }
            sb.append("</head>").append("\n");

            // Body: every element carrying an image path gets the remote image root.
            Elements body = doc.getElementsByTag("body");
            for (Element tag : body.select("*")) {
                String src = tag.attr("src");
                if (src.contains("/images")) {
                    tag.attr("src", rebase(src, "/images", Img));
                }
            }
            sb.append(body.toString());
            sb.append("</html>");

            Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(url)), "gbk"));
            try {
                out.write(sb.toString());
            } finally {
                out.close();
            }
            System.out.println("页面已经爬完");
        } catch (IOException e) {
            e.printStackTrace();
        }
        return url;
    }

    /**
     * Replaces {@code localPart} in {@code value} with {@code remoteUrl}, leaving values that are
     * already absolute untouched so a second host is never spliced into them.
     */
    private static String rebase(String value, String localPart, String remoteUrl) {
        if (value.contains("://")) {
            return value;
        }
        return value.replace(localPart, remoteUrl);
    }

    public static void main(String[] args) throws Exception {
        String[] params = {"admin", "admin123"};
        HttpClient client = loginHtml(SITE, PORT, loginAction, params);
        // Fetch the page that requires the login.
        String path = createHtml(client, forwardURL);
        System.out.println(JsoupFile(path, 1));
    }
}
HttpClient 抓取页面并处理样式后的页面效果（将从网站下载并保存为 txt 文件的 css 写入 html）
Java代码 收藏代码
package com.app.html;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.cookie.CookieSpec;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.app.comom.FileUtil;
/**
 * Logs in with Commons HttpClient, saves the personal page, then uses jsoup to produce a second
 * file that inlines a locally stored stylesheet and points script and image references at the
 * remote site.
 */
public class HttpClientHtml {
    private static final String SITE = "login.goodjobs.cn";
    private static final int PORT = 80;
    private static final String loginAction = "/index.php/action/UserLogin";
    private static final String forwardURL = "http://user.goodjobs.cn/dispatcher.php/module/Personal/?skip_fill=1";
    private static final String toUrl = "d:\\test\\";
    private static final String hostCss = "d:\\test\\style.txt";
    private static final String Img = "http://user.goodjobs.cn/images";
    private static final String _JS = "http://user.goodjobs.cn/scripts/fValidate/fValidate.one.js";

    /**
     * Posts the login form and prints the cookies the client holds for the login host.
     *
     * @param LOGON_SITE   host name of the login server
     * @param LOGON_PORT   port of the login server
     * @param login_Action path of the login action on that host
     * @param params       {@code params[0]} is the user name, {@code params[1]} the password
     * @return the client that performed the login
     * @throws Exception if the request fails
     */
    private static HttpClient loginHtml(String LOGON_SITE, int LOGON_PORT, String login_Action, String... params) throws Exception {
        HttpClient client = new HttpClient();
        client.getHostConfiguration().setHost(LOGON_SITE, LOGON_PORT);

        PostMethod post = new PostMethod(login_Action);
        try {
            NameValuePair userName = new NameValuePair("memberName", params[0]);
            NameValuePair password = new NameValuePair("password", params[1]);
            post.setRequestBody(new NameValuePair[] { userName, password });
            client.executeMethod(post);
        } finally {
            // Release even when the request throws, otherwise the connection is never handed back.
            post.releaseConnection();
        }

        // Show which cookies the login produced.
        CookieSpec cookiespec = CookiePolicy.getDefaultSpec();
        Cookie[] cookies = cookiespec.match(LOGON_SITE, LOGON_PORT, "/", false,
                client.getState().getCookies());
        if (cookies != null) {
            if (cookies.length == 0) {
                System.out.println("Cookies is not Exists ");
            } else {
                for (int i = 0; i < cookies.length; i++) {
                    System.out.println(cookies[i].toString());
                }
            }
        }
        return client;
    }

    /**
     * Downloads {@code newUrl} with the logged-in client and stores the raw page.
     *
     * @param client client returned by the login step
     * @param newUrl URL of the page to download
     * @return path of the stored file, {@code d:\test\<yyyy-MM-dd>_1.html}
     * @throws Exception if the request fails
     */
    private static String createHtml(HttpClient client, String newUrl) throws Exception {
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        String filePath = toUrl + format.format(new Date()) + "_" + 1 + ".html";
        PostMethod post = new PostMethod(newUrl);
        try {
            client.executeMethod(post);
            // NOTE(review): presumably the charset used to decode the body when the response
            // does not declare one - confirm against the HttpClient 3.x documentation.
            post.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, "GBK");
            String content = post.getResponseBodyAsString();
            FileUtil.write(content, filePath);
            System.out.println("\n写入文件成功!");
        } finally {
            post.releaseConnection();
        }
        return filePath;
    }

    /**
     * Rebuilds the stored page with jsoup: keeps the meta tags, copies in the text of the local
     * style file, keeps the validation script and the body, and points script and image
     * references at the remote site.
     *
     * @param filePath path of the page stored by the download step
     * @param random   number used in the output file name
     * @return path of the output file, {@code d:\test\<yyyy-MM-dd>_new_<random>.html}; the path is
     *         returned even when reading or writing failed (the exception is printed to standard
     *         error)
     */
    private static String JsoupFile(String filePath, int random) {
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        File infile = new File(filePath);
        String url = toUrl + format.format(new Date()) + "_new_" + random + ".html";
        try {
            Document doc = Jsoup.parse(infile, "GBK");
            StringBuilder sb = new StringBuilder();

            // The published listing appended empty strings here, apparently markup stripped by
            // the blog platform; the wrappers are restored so the title is not emitted as bare text.
            sb.append("<html>").append("\n");
            sb.append("<head>").append("\n");
            sb.append("<title>欢迎使用新安人才网个人专区 </title>").append("\n");
            sb.append(doc.getElementsByTag("meta").toString()).append("\n");

            // Local css: copied verbatim, line by line.
            // NOTE(review): the style file is assumed to carry its own <style> element - confirm.
            BufferedReader in = new BufferedReader(new FileReader(new File(hostCss)));
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    sb.append(line).append("\n");
                }
            } finally {
                in.close();
            }

            // Script: keep only the validation script, pointed at the remote copy.
            for (Element js : doc.select("script")) {
                String src = js.attr("src");
                if (src.contains("/fValidate.one.js")) {
                    js.attr("src", rebase(src, "/scripts/fValidate/fValidate.one.js", _JS));
                    sb.append(js.toString()).append("\n");
                }
            }
            sb.append("</head>").append("\n");

            // Body: every element carrying an image path gets the remote image root.
            Elements body = doc.getElementsByTag("body");
            for (Element tag : body.select("*")) {
                String src = tag.attr("src");
                if (src.contains("/images")) {
                    tag.attr("src", rebase(src, "/images", Img));
                }
            }
            sb.append(body.toString());
            sb.append("</html>");

            // Open the output only once the content is complete, so a missing css file does not
            // leave a truncated, empty result behind.
            Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(url)), "gbk"));
            try {
                out.write(sb.toString());
            } finally {
                out.close();
            }
            System.out.println("页面已经爬完");
        } catch (IOException e) {
            e.printStackTrace();
        }
        return url;
    }

    /**
     * Replaces {@code localPart} in {@code value} with {@code remoteUrl}, leaving values that are
     * already absolute untouched so a second host is never spliced into them.
     */
    private static String rebase(String value, String localPart, String remoteUrl) {
        if (value.contains("://")) {
            return value;
        }
        return value.replace(localPart, remoteUrl);
    }

    public static void main(String[] args) throws Exception {
        String[] params = {"admin", "admin123"};
        HttpClient client = loginHtml(SITE, PORT, loginAction, params);
        // Generate the page.
        String path = createHtml(client, forwardURL);
        System.out.println(JsoupFile(path, 1));
    }
}