使用HttpClient做的一个简单的网络爬虫

有时我们想把某个网站上的一些有用信息抓取下来。比如下面的例子:想把 http://www.ef360.com 这个网站上所有制衣企业的信息取下来,并保存到自己的 Excel 文件中。一个一个手工复制?太费事了。那该怎么办呢?请看下面的基本源码(App 类):
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;


/**
 * Minimal crawler for the company directory on company.ef360.com.
 *
 * <p>It walks the paginated company list of one province, follows every
 * {@code Detail/<id>.html} link once, scrapes a fixed set of fields from each
 * detail page with regular expressions and hands the collected rows to
 * {@code ExcelUtil.exportExcel}.
 *
 * <p>Raw collection types are kept on purpose: the rest of this file is
 * written without generics and {@code ExcelUtil.exportExcel} takes a raw
 * {@code List}.
 */
public class App {
	// Site being analysed.
	public static final String LOGON_SITE="http://www.ef360.com";
	public static final int LOGON_PORT=80;
	// Human-readable column titles, parallel to key (same order, same length).
	private static String[] desc=new String[]{
		"公司介绍",
		"主营产品",
		"公司名称",
		"所在省市",
		"公司地址",
		"邮编",
		"联系人",
		"联系电话",
		"手机号码",
		"传真",
		"网址",
		"公司网址"
	};
	// Keys under which each scraped value is stored in a result row; the order
	// here is the column order passed to ExcelUtil.
	private static String[] key=new String[]{
		"companyInfo",
		"products",
		"companyName",
		"province",
		"address",
		"zipCode",
		"headerMan",
		"phone",
		"mobilePhone",
		"tax",
		"detailUrl",
		"netUrl"
	};

	// One link to a company detail page; group 1 is the numeric company id.
	// The id is matched explicitly (instead of a greedy ".*") so that several
	// links on the same line of HTML are found individually.
	private static final Pattern DETAIL_LINK=Pattern.compile("Detail/([0-9]+)\\.html");

	// Labels of the table cells scraped from a detail page, as they appear in
	// the page HTML, and the result-row key each one is stored under.
	private static final String[] FIELD_LABELS=new String[]{
		"主营产品",
		"公司名称",
		"所在省市",
		"公司地址",
		"邮编",
		"联系人",
		"联系电话",
		"手机号码",
		"联系传真",
		"公司网址"
	};
	private static final String[] FIELD_KEYS=new String[]{
		"products",
		"companyName",
		"province",
		"address",
		"zipCode",
		"headerMan",
		"phone",
		"mobilePhone",
		"tax",
		"netUrl"
	};

	/**
	 * Fetches the manufacturers listed for Fujian (area code 350000) and
	 * exports them to an Excel file under {@code D:/}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {

		HttpClient client=new HttpClient();
		// NOTE(review): LOGON_SITE includes the "http://" scheme although it is
		// passed as a host here. Every request below uses an absolute URL, so
		// this default host looks unused -- confirm before relying on it.
		client.getHostConfiguration().setHost(LOGON_SITE, LOGON_PORT);
		// Area code of the province to crawl.
		String areaId="350000";// Fujian
		int pages=2;// number of list pages to fetch; may be set larger
		// Company ids that were already processed (used as a set).
		Map seenCodes=new HashMap();
		// One Map per company.
		List resultList= new ArrayList();
		System.out.println("=======开始链接,请等待...========");
		long start=System.currentTimeMillis();
		// Pages are numbered from 1 and the last page is included, hence "<=".
		for(int i=1;i<=pages;i++){
			// Company list for one province: Province is the area code, page the page number.
			String listUrl="http://company.ef360.com/CompanyList.asp?Province="+areaId+"&page="+i;
			String listHtml=fetch(client,listUrl);

			Matcher link=DETAIL_LINK.matcher(listHtml);
			while(link.find()){
				String findCode=link.group(1);
				if(seenCodes.containsKey(findCode)){// skip companies already handled
					continue;
				}
				seenCodes.put(findCode, findCode);
				// Detail page of this company.
				String detailUrl="http://company.ef360.com/Detail/"+findCode+".html";
				System.out.println("开始链接:"+detailUrl);
				String content=fetch(client,detailUrl);
				// A failed download yields an empty page, which produces a row
				// whose scraped fields are all empty but that still carries the URL.
				resultList.add(parseCompany(content,detailUrl));
			}
		}
		System.out.println("=======开始写入xls========");
		ExcelUtil.exportExcel(key,desc,resultList,"D:/");

		long end=System.currentTimeMillis();
		System.out.println("执行结束,共花费时间:"+(end-start)/1000+"s");

	}

	/**
	 * Downloads one page as text, decoding it as gb2312.
	 *
	 * @param client the HTTP client to execute the request with
	 * @param url absolute URL of the page
	 * @return the response body, or an empty string when the request fails or
	 *         the response carries no body
	 */
	private static String fetch(HttpClient client,String url){
		GetMethod get=new GetMethod(url);
		get.getParams().setContentCharset("gb2312");
		try {
			client.executeMethod(get);
			String body=get.getResponseBodyAsString();
			// A null body would make Pattern.matcher() throw later on.
			return body==null ? "" : body;
		} catch (HttpException e) {
			// Best effort: report the failure and continue with the next page.
			e.printStackTrace();
			return "";
		} catch (IOException e) {
			e.printStackTrace();
			return "";
		} finally {
			// Always hand the connection back, including on failure.
			get.releaseConnection();
		}
	}

	/**
	 * Scrapes all exported fields from the HTML of one company detail page.
	 *
	 * @param content HTML of the detail page
	 * @param detailUrl URL the page was loaded from; stored as "detailUrl"
	 * @return a map holding one entry per element of {@code key}
	 */
	private static Map parseCompany(String content,String detailUrl){
		Map result=new HashMap();
		// NOTE(review): these patterns are greedy, so each match runs up to the
		// last occurrence of its marker in the page; processDetail() then cuts
		// at the first start flag inside the match. For the introduction there
		// is no end flag, so this presumably relies on the page containing a
		// single CompanyTit_Body block -- confirm against the live markup.
		result.put("companyInfo", processDetail(content,"公司介绍</div>[\\s\\S]*CompanyTit_Body.*</div>","CompanyTit_Body",null,2,0));
		for(int i=0;i<FIELD_KEYS.length;i++){
			String regex=FIELD_LABELS[i]+"</TD>[\\s\\S]*InnerContent_Search.*</TD>";
			result.put(FIELD_KEYS[i], processDetail(content,regex,"InnerContent_Search","</TD>",3,0));
		}
		result.put("detailUrl", detailUrl);
		return result;
	}

	/**
	 * Extracts one piece of text from an HTML page.
	 *
	 * <p>The first match of {@code tex} in {@code content} is taken. Inside
	 * that match the text starts {@code starts} characters after the first
	 * occurrence of {@code startFlag} and ends {@code ends} characters before
	 * the end of the match; if {@code endFlag} is given and present, the text
	 * is additionally cut at its first occurrence. Finally {@code <br>} and
	 * {@code <p>} become line breaks and {@code </TD>}, {@code </div>},
	 * {@code </a>} and opening {@code <a href=...>} tags are removed.
	 *
	 * @param content the HTML to search; {@code null} is treated as empty
	 * @param tex regular expression locating the region of interest
	 * @param startFlag marker inside the matched region that precedes the value
	 * @param endFlag marker that terminates the value, or {@code null} for none
	 * @param starts number of characters to skip after {@code startFlag}
	 * @param ends number of characters to drop from the end of the match
	 * @return the extracted text, or an empty string when {@code tex} does not
	 *         match, {@code startFlag} is missing or the offsets do not fit
	 *         into the matched region
	 */
	public static String processDetail(String content,String tex,String startFlag,String endFlag,int starts,int ends){
		if(content==null){
			return "";
		}
		Matcher m=Pattern.compile(tex).matcher(content);
		if(!m.find()){
			return "";
		}
		String matched=m.group();
		int flagAt=matched.indexOf(startFlag);
		if(flagAt<0){
			return "";
		}
		int from=flagAt+startFlag.length()+starts;
		int to=matched.length()-ends;
		// Offsets that do not fit the match used to end in a
		// StringIndexOutOfBoundsException; treat them as "nothing found".
		if(from<0 || to>matched.length() || from>to){
			return "";
		}
		String tempStr=matched.substring(from, to);
		if(endFlag!=null){
			int endAt=tempStr.indexOf(endFlag);
			if(endAt>=0){// a missing end flag leaves the text uncut
				tempStr=tempStr.substring(0, endAt);
			}
		}
		tempStr=tempStr.replaceAll("<br>", "\n");
		tempStr=tempStr.replaceAll("<p>", "\n");
		tempStr=tempStr.replaceAll("</TD>", "");
		tempStr=tempStr.replaceAll("</div>", "");
		tempStr=tempStr.replaceAll("</a>", "");
		// Remove only the tag itself; a greedy ".*" would also swallow any
		// text up to the last '>' on the same line.
		tempStr=tempStr.replaceAll("<a href=[^>]*>", "");
		return tempStr;
	}

}



ExcelUtil 工具类,只是为了输出excel
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.List;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFFont;
import org.apache.poi.hssf.usermodel.HSSFRichTextString;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.util.HSSFColor;

/**
 * Helper that dumps a list of maps into an .xls workbook using POI HSSF.
 */
public class ExcelUtil {

	/**
	 * Writes {@code list} to the file {@code path + "我的Excel.xls"}.
	 *
	 * <p>The sheet has two header rows (row 0 holds {@code key}, row 1 holds
	 * {@code desc}) followed by one row per list element, starting at row 2.
	 * Column {@code j} of a data row holds the value stored under
	 * {@code key[j]}: numbers are written as numeric cells, dates as text in
	 * the form {@code yyyy-MM-dd HH:mm:ss}, a missing value as an empty
	 * string and anything else through {@code toString()}.
	 *
	 * <p>I/O failures are printed to standard error and not rethrown.
	 *
	 * @param key first header row; also the map keys that select each column
	 * @param desc second header row; must have at least {@code key.length} entries
	 * @param list the data, one {@code java.util.Map} per row; {@code null} is
	 *        treated as empty
	 * @param path prefix the file name is appended to, e.g. {@code "D:/"}
	 * @throws IllegalArgumentException if {@code desc} is shorter than {@code key}
	 */
	public static void exportExcel(String[] key,String[] desc,List list,String path){
		// Fail with a clear message instead of an ArrayIndexOutOfBoundsException
		// half way through building the header.
		if(desc.length<key.length){
			throw new IllegalArgumentException("desc has "+desc.length+" entries but key has "+key.length);
		}
		String fileName="我的Excel.xls";
		HSSFWorkbook workbook=new HSSFWorkbook();
		HSSFSheet sheet=workbook.createSheet();

		workbook.setSheetName(0, "信息",HSSFWorkbook.ENCODING_UTF_16);
		HSSFRow keyRow=sheet.createRow((short)0);
		HSSFRow descRow=sheet.createRow((short)1);
		keyRow.setHeight((short) 660);// row height
		descRow.setHeight((short) 660);
		// Shared style of both header rows: centred, bold red font.
		HSSFCellStyle style=workbook.createCellStyle();
		style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
		style.setVerticalAlignment(HSSFCellStyle.VERTICAL_CENTER);
		style.setFillBackgroundColor(HSSFColor.GREY_25_PERCENT.index);
		HSSFFont font=workbook.createFont();
		font.setColor(HSSFColor.RED.index);
		font.setFontHeight((short)260);
		font.setBoldweight(HSSFFont.BOLDWEIGHT_BOLD);
		style.setFont(font);
		for(int i=0;i<key.length;i++){
			sheet.setColumnWidth((short)i,(short)5000);// same width for every column
			// Header row 1: the keys.
			HSSFCell keyCell=keyRow.createCell((short) i);
			keyCell.setCellStyle(style);
			writeText(keyCell,key[i]);
			// Header row 2: the descriptions.
			HSSFCell descCell=descRow.createCell((short) i);
			descCell.setCellStyle(style);
			writeText(descCell,desc[i]);
		}
		// Data rows; every list element is a map keyed by the entries of key.
		SimpleDateFormat dateFormat=new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
		int rowCount=(list==null) ? 0 : list.size();
		for(int i=0;i<rowCount;i++){
			// Accept any Map implementation, not only HashMap. The type is
			// fully qualified because java.util.Map is not imported here.
			java.util.Map map=(java.util.Map)list.get(i);
			// Data starts on the third row. No (short) cast on i: it made the
			// row index wrap around once i exceeded Short.MAX_VALUE.
			HSSFRow row=sheet.createRow(i+2);
			for(int j=0;j<key.length;j++){// one cell per column
				HSSFCell cell=row.createCell((short) j);
				Object value=map.get(key[j]);
				if(value==null){
					value="";
				}

				if(value instanceof Number){
					cell.setEncoding(HSSFCell.ENCODING_UTF_16);
					cell.setCellType(HSSFCell.CELL_TYPE_NUMERIC);
					cell.setCellValue(((Number)value).doubleValue());
				}else if(value instanceof Date){
					writeText(cell,dateFormat.format((Date)value));
				}else{
					// Strings, and every other type via toString(), so that no
					// value is silently dropped.
					writeText(cell,value.toString());
				}
			}
		}

		FileOutputStream out=null;
		try {
			out=new FileOutputStream(path+fileName);
			workbook.write(out);
			out.flush();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		}catch (IOException e) {
			e.printStackTrace();
		}finally{
			// out is still null when the file could not be opened.
			if(out!=null){
				try {
					out.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

	}

	/**
	 * Stores {@code text} in {@code cell} as a UTF-16 encoded string cell
	 * (the encoding is what keeps non-ASCII text readable in the output).
	 */
	private static void writeText(HSSFCell cell,String text){
		cell.setEncoding(HSSFCell.ENCODING_UTF_16);
		cell.setCellType(HSSFCell.CELL_TYPE_STRING);
		cell.setCellValue(text);
	}
}


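下面是用到的2个包(从源码的 import 来看,即 Apache commons-httpclient 和 Apache POI);运行时可能还需要 Apache 的其他 commons-XXX.jar 包(例如 commons-httpclient 所依赖的 commons-logging 和 commons-codec)。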

你可能感兴趣的:(apache,Excel,J#)