抓取数据设置cookie

今天遇到一个问题,顺便把它写下来。今天在抓取一个网站的时候,页面看起来很简单,人工浏览完全没问题,可一旦用程序去抓取,问题就来了:老是提示我链接错误。一开始一头雾水,为啥呢?

仔细想了一下,难道是 cookie 在作怪?好,那就找一下我以前带 cookie 提交请求访问页面的程序,结果不知道放到哪里去了。

花了差不多两个小时,才找到了一份源代码。下面是我修改过的程序:

package org.qichao.mode;

import java.io.*;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.*;
import org.apache.commons.httpclient.params.HttpMethodParams;

/**
 * Fetches one page with Commons HttpClient while sending a hand-built set of
 * browser-like request headers, including a fixed {@code Cookie} header, and
 * prints the response body to standard output.
 */
public class UR {

    /**
     * Issues a single GET request with the hard-coded headers below and prints
     * the response body. A non-200 status is reported on standard error, but
     * the body is still read and printed.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        HttpClient httpClient = new HttpClient();
        httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

        // Create the GET method instance.
        GetMethod getMethod = new GetMethod("http://www.51ys.com/See_Url_one.asp?operator=25041782C95478FEE686A09");

        // NOTE(review): this Host value differs from the host in the URL above
        // (www.51ys.com) -- confirm that this is intentional.
        getMethod.setRequestHeader("Host","cards.360114.com");
        getMethod.setRequestHeader("User-Agent","Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.8.1.20) Gecko/20081217 Firefox/2.0.0.20");
        getMethod.setRequestHeader("Accept","text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5");
        getMethod.setRequestHeader("Accept-Language","zh-cn,zh;q=0.5");
        // NOTE(review): the body is printed below without any decompression
        // step; if the server honours this header and replies gzip-encoded,
        // the output will presumably be unreadable -- verify against the
        // HttpClient version in use.
        getMethod.setRequestHeader("Accept-Encoding","gzip,deflate");
        getMethod.setRequestHeader("Accept-Charset","gb2312,utf-8;q=0.7,*;q=0.7");
        getMethod.setRequestHeader("Keep-Alive","300");
        getMethod.setRequestHeader("Connection","keep-alive");
        getMethod.setRequestHeader("Referer","http://www.360114.com/yellowpage/query.asp?Call=77&h1=GSLANVG&Spara=3&Cpara=&h2=HSIFJTNJHIH&Tpara=&h3=EDJYLUE&h5=@GAXBXFR@R@&scall=");
        // Fixed cookie string, presumably copied from a browser session; it
        // will need refreshing once the server-side session expires.
        getMethod.setRequestHeader("Cookie","__utmz=76121879.1230526182.3.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=%E4%BC%81%E4%B8%9A%E9%BB%84%E9%A1%B5%E5%A4%A7%E5%85%A8; __utma=76121879.2444684742963329000.1230517736.1230526182.1230530122.4; __utmc=76121879; ASPSESSIONIDAAATASRQ=IFDOECBAHDBKJFKKMKDOEFCP");

        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());

        try {
            // Execute the GET request.
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: "
                        + getMethod.getStatusLine());
            }

            // Read the response content.
            byte[] responseBody = getMethod.getResponseBody();
            if (responseBody == null) {
                // No body available; avoid a NullPointerException in new String(...).
                System.err.println("Response body is empty");
            } else {
                // Process the content. Decoded with the platform default charset,
                // as before. NOTE(review): consider decoding with the charset the
                // server declares if the output looks garbled.
                System.out.println(new String(responseBody));
            }
        } catch (HttpException e) {
            // Fatal protocol-level failure: the protocol may be wrong or the
            // returned content may be malformed.
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
        } catch (IOException e) {
            // A network-level error occurred.
            e.printStackTrace();
        } finally {
            // Always release the connection, whether the request succeeded or failed.
            getMethod.releaseConnection();
        }
    }
}
 

你可能感兴趣的:(apache,windows,xml,asp,firefox)