Understanding the Topic Crawler Source Code

// Web crawler: understanding a topic-specific (focused) crawler
// Example: fetching airline ticket prices for a specified route
package com;


import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.visitors.NodeVisitor;




public class RetrivePage {
    private static HttpClient httpClient = new HttpClient();

    // Configure a proxy server for the client, if one is needed
    static {
        // Set the proxy server's IP address and port, e.g.:
        // httpClient.getHostConfiguration().setProxy("172.17.18.84", 8080);
    }
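
    // Sketch added for illustration (not part of the original listing): how
    // commons-httpclient 3.x can be pointed at an HTTP proxy, including optional
    // proxy authentication. Host, port, and credentials are placeholders; call
    // this from the static block above only when a proxy is actually required.
    private static void configureProxy(String host, int port, String user, String password) {
        // Route all requests issued by httpClient through the proxy
        httpClient.getHostConfiguration().setProxy(host, port);
        // Register credentials for the proxy (only needed if it requires authentication)
        httpClient.getState().setProxyCredentials(
                new org.apache.commons.httpclient.auth.AuthScope(host, port),
                new org.apache.commons.httpclient.UsernamePasswordCredentials(user, password));
    }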

    public static boolean downloadPage(String path) throws HttpException, IOException, Exception {
        InputStream input = null;
        OutputStream output = null;

        // Build the GET method for the target URL
        GetMethod getMethod = new GetMethod(path);

        // Set the query parameters of the GET request
        NameValuePair[] postData = new NameValuePair[8];
        postData[0] = new NameValuePair("DCity1", "BJS");
        postData[1] = new NameValuePair("ACity1", "SHA");
        postData[2] = new NameValuePair("DDate1", "2010-1-31");
        postData[3] = new NameValuePair("ClassType", "");
        postData[4] = new NameValuePair("PassengerQuantity", "1");
        postData[5] = new NameValuePair("SendTicketCity", "%u5317%u4EAC");
        postData[6] = new NameValuePair("Airline", "");
        postData[7] = new NameValuePair("PassengerType", "ADU");
        getMethod.setQueryString(postData);
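
        // Note (added for clarity): DCity1/ACity1 are departure/arrival city codes
        // (BJS = Beijing, SHA = Shanghai) and DDate1 is the departure date.
        // setQueryString() URL-encodes the name/value pairs and appends them to the
        // request URI as "?DCity1=BJS&ACity1=SHA&...". "%u5317%u4EAC" is the site's
        // %u-style escape of the Chinese characters for "Beijing" (U+5317, U+4EAC).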
 
        // Execute the request
        int statusCode = httpClient.executeMethod(getMethod);

        // Handle the status code (for simplicity, only 200 OK is handled)
        if (statusCode == HttpStatus.SC_OK) {
            // Get the response body as a stream
            input = getMethod.getResponseBodyAsStream();
            String charset = getMethod.getResponseCharSet();
 
            // Derive a local file name from the URL, adding a timestamp to keep it unique
            String filename = path.substring(path.lastIndexOf('/') + 1);
            filename += System.currentTimeMillis();

            // Open an output stream to the local file
            File tempFile = new File(filename);
            if (!tempFile.exists()) {
                tempFile.createNewFile();
            }
            output = new FileOutputStream(tempFile);
 
            // Copy the response body to the file byte by byte
            int tempByte = -1;
            while ((tempByte = input.read()) != -1) {
                output.write(tempByte);
            }

            // Close the input and output streams
            if (input != null) {
                input.close();
            }

            if (output != null) {
                output.close();
            }

            // Parse the downloaded file with HtmlParser
            Parser parser = new Parser(tempFile.getAbsolutePath());
            parser.setEncoding(charset);
            // Visitor that extracts the flight data from the result table:
            // flag marks that we are inside a <tbody>, index tracks the <td> column
            NodeVisitor nodeVisitor = new NodeVisitor() {
                private boolean flag = false;
                private int index = -1;

                public void visitTag(Tag tag) {
                    if (tag.getTagName().equals("TBODY")) {
                        System.out.println("begin.....");
                        flag = true;
                        index = 0;
                    }
                    if (tag.getTagName().equals("TD") && flag) {
                        switch (index) {
                        case 0:
                            System.out.println("from_to:" + tag.toPlainTextString().trim());
                            break;
                        case 1:
                            System.out.println("carrier:" + tag.toPlainTextString().trim());
                            break;
                        case 2:
                            System.out.println("type:" + tag.toPlainTextString().trim());
                            break;
                        case 3:
                            System.out.println("number:" + tag.toPlainTextString().trim());
                            break;
                        case 4:
                            System.out.println("discount:" + tag.toPlainTextString().trim());
                            break;
                        case 5:
                            System.out.println("price:" + tag.toPlainTextString().trim());
                            break;
                        }
                        index++;
                    }
                }

                public void visitEndTag(Tag tag) {
                    if (tag.getTagName().equals("TBODY")) {
                        System.out.println("end........");
                        flag = false;
                    }
                }
            };
 
            parser.visitAllNodesWith(nodeVisitor);
            return true;
        }

        return false;
    }


    /**
     * @param args
     */
    public static void main(String[] args) {
        // Download and parse the example page
        try {
            RetrivePage.downloadPage("http://www.CSDN.com");
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }


}
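
The listing above closes its streams only on the happy path and never releases the HTTP connection back to HttpClient. The snippet below is a rough sketch of how the download step could be hardened; the class and method names (DownloadSketch, download) are mine, not from the original, and it assumes the same commons-httpclient 3.x API used above.

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

public class DownloadSketch {
    // Downloads "url" to "target", closing the streams and releasing the
    // connection in finally blocks so resources are freed even on errors.
    public static boolean download(HttpClient client, String url, File target) throws IOException {
        GetMethod get = new GetMethod(url);
        try {
            if (client.executeMethod(get) != HttpStatus.SC_OK) {
                return false;
            }
            InputStream input = get.getResponseBodyAsStream();
            try {
                OutputStream output = new FileOutputStream(target);
                try {
                    int b;
                    while ((b = input.read()) != -1) {  // -1 marks end of stream; 0 is a valid byte
                        output.write(b);
                    }
                } finally {
                    output.close();
                }
            } finally {
                input.close();
            }
            return true;
        } finally {
            // Return the underlying connection to HttpClient's connection manager
            get.releaseConnection();
        }
    }
}

RetrivePage.downloadPage() could delegate its copy step to a helper like this without changing the parsing logic that follows.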
