java-爬虫部分:关于京东模拟登陆的两种实现

最近要做一个爬虫,需要网站数据,先拿京东开刀。


因为我是java开发的,所以最开始的时候,想到了httpClient和htmlunit两个东东,于是开始做实验。


网上很久以前流传着一个登陆人人网的例子,我就拿过来照搬了一下,发现不灵,后来才发现是自己没理解人家的精髓。然后用htmlunit去模拟,发现京东的js比较复杂,一位多年爬虫经验的哥们告诉我说htmlunit对js支持的不好,有些网站就是不灵的。没办法,自己想吧。


(1)打开京东的登陆页面,看他的源码,发现是执行了一个ajax,具体链接是:https://passport.jd.com/uc/loginService?uuid=f5c0dd5a-762c-4230-b8c0-f70589b7dbdb&ReturnUrl=http://order.jd.com/center/list.action&r=0.66408410689742&loginname=username&nloginpwd=xxxxxx&loginpwd=xxxxxx&machineNet=&machineCpu=&machineDisk=&authcode=&saHrhnkIIX=GXgVo


每次刷新页面,uuid和最后一个参数都是不一样的。然后在火狐打开登陆页,把参数拼在一起后,直接在火狐里访问,没问题,登陆成功;但是在火狐打开登陆页,把参数拼起来后,放到IE里访问却不能成功。OK,看来是在cookie里存了一些东西,后面用来做验证了。


基于以上分析,做了第一套代码:


核心代码如下:


package com.lkb.test;


import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;


import org.apache.http.HttpResponse;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.message.BufferedHeader;
import org.apache.http.protocol.HTTP;


public class JD {
    // The configuration items
    private static String userName = "xxx";
    private static String password = "yyy";
    
    private static String redirectURL = "http://order.jd.com/center/list.action";
    private static String loginUrl = "http://passport.jd.com/uc/login";
    // Don't change the following URL
    private static String renRenLoginURL = "https://passport.jd.com/uc/loginService";


    // The HttpClient is used in one session
    private HttpResponse response;
    private DefaultHttpClient httpclient = new DefaultHttpClient();


    public  Map getParams(){
     Map map = new HashMap();
     String str = getText(loginUrl);
     String strs1[] = str.split("name=\"uuid\" value=\"");
     String strs2[] = strs1[1].split("\"/>");
     String uuid = strs2[0];
     map.put("uuid", uuid);
     System.out.println(strs2[0]);
     String str3s[] = strs1[1].split("      String strs4[] = str3s[1].split("/>");
     String strs5[] = strs4[0].trim().split("\"");
     String key = strs5[0];
     String value = strs5[2];
     map.put(key, value);
     return map;
    }
    private boolean login() {
     Map map = getParams();
     
        HttpPost httpost = new HttpPost(renRenLoginURL);
        // All the parameters post to the web site
        List nvps = new ArrayList();
        nvps.add(new BasicNameValuePair("ReturnUrl", redirectURL));
        nvps.add(new BasicNameValuePair("loginname", userName));
        nvps.add(new BasicNameValuePair("nloginpwd", password));
        nvps.add(new BasicNameValuePair("loginpwd", password)); 
        Iterator it = map.keySet().iterator();
        while(it.hasNext()) { 
         String key = it.next().toString();
         String value = map.get(key).toString();
         nvps.add(new BasicNameValuePair(key, value)); 
         
        }
             
        try {
            httpost.setEntity(new UrlEncodedFormEntity((List) nvps, HTTP.UTF_8));
            response = httpclient.execute(httpost);
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            httpost.abort();
        }
        return true;
    }


    private String getRedirectLocation() {
     BufferedHeader locationHeader =  (BufferedHeader) response.getFirstHeader("Location");
        if (locationHeader == null) {
            return null;
        }
        return locationHeader.getValue();
    }


    private String getText(String redirectLocation) {
        HttpGet httpget = new HttpGet(redirectLocation);
        ResponseHandler responseHandler = new BasicResponseHandler();
        String responseBody = "";
        try {
            responseBody = httpclient.execute(httpget, responseHandler);
        } catch (Exception e) {
            e.printStackTrace();
            responseBody = null;
        } finally {
            httpget.abort();
            //httpclient.getConnectionManager().shutdown();
        }
        return responseBody;
    }


    public void printText() {
        if (login()) {        
         System.out.println(getText(redirectURL));
           String redirectLocation = getRedirectLocation();
            if (redirectLocation != null) {
                System.out.println(getText(redirectLocation));
            }
        }
    }


    public static void main(String[] args) {
          JD renRen = new JD();
          //renRen.getParams();
          renRen.printText();
    }
}

验证码解决:

/*
* 取得验证码图片
*/
public File getMarkFile(DefaultHttpClient httpclient,String check,final String picName) {
ResponseHandler responseHandler = new ResponseHandler() {
@Override
public File handleResponse(final HttpResponse response)
throws ClientProtocolException, IOException {
// TODO Auto-generated method stub
int status = response.getStatusLine().getStatusCode();
if (status >= 200 && status < 300) {
String tmpPath = System.getProperty("user.home")
+ File.separator + ".jd";
File pngf = new File(tmpPath);
if (!(pngf.exists() || pngf.isDirectory())) {
pngf.mkdirs();
}
java.text.DateFormat format2 = new java.text.SimpleDateFormat("yyyyMMdd");
   String path = format2.format(new Date());
   String authcodePath = InfoUtil.getInstance().getInfo("road", "authcodePath");
String filePath = authcodePath
+ path;
   File file= new File(filePath);
 //判断文件夹是否存在,如果不存在则创建文件夹
if (!file.exists()) {
  file.mkdir();
 }

String fullpath = filePath +"\\" +picName;


HttpEntity entity = response.getEntity();
return entity != null ? filePutContents(fullpath,
entity.getContent()) : null;
} else {
throw new ClientProtocolException(
"Unexpected response status:" + status);
}
}
};
HttpGet httpGet = new HttpGet(check);
File f = null;
try {
f = httpclient.execute(httpGet, responseHandler);
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
httpGet.abort();
}
return f;
}


/*
* png存储为file
*/
/**
 * Copies a stream into a file.
 *
 * <p>I/O failures are printed, not thrown, so the returned file may be
 * missing or incomplete when writing failed. The input stream is not closed
 * here; that is left to the caller.
 *
 * @param fileName path of the file to write
 * @param is       the data to store
 * @return a {@code File} for {@code fileName}
 */
private File filePutContents(String fileName, InputStream is) {
    File file = new File(fileName);
    OutputStream os = null;
    try {
        os = new FileOutputStream(file);
        byte[] buffer = new byte[4 * 1024];
        int len;
        while ((len = is.read(buffer)) != -1) {
            os.write(buffer, 0, len);
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // os is still null when the file could not be opened.
        if (os != null) {
            try {
                os.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return file;
}

 


(2)后来在实践的过程中又在想,如果每个网站都这么复杂,如果人家要是改了实现方式怎么办,于是又找到了selenium2,发现这个东东是个好东东,可以实现模拟登陆,但是有一个缺点是要弹出页面,因为刚开始试验这个,所以还不熟悉。还有一点是你的操作需要设置sleep时间,不然会出问题。关于这一点还需要大家帮我改进一下,核心代码如下:


package com.lkb;


import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebDriver.Navigation;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;


public class JDTest {
 
    public static void main(String[] args) {
     JDTest jd = new JDTest();
     jd.connection();
  
  }
    public void connection(){
     WebDriver driver = new FirefoxDriver();
     waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  Navigation navigation = driver.navigate();
  navigation.to("https://passport.360buy.com/new/login.aspx");
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  WebElement loginName = driver.findElement(By.id("loginname"));
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  loginName.sendKeys(Constant.USERNAME);
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  WebElement loginPwd = driver.findElement(By.id("nloginpwd"));
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  loginPwd.sendKeys(Constant.password);
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();




  WebElement loginButton = driver.findElement(By.id("loginsubmit"));
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  waitForSecond();
  loginButton.click();
  waitForSecond();


  navigation.to("http://order.jd.com/center/list.action");
  System.out.println(driver.getPageSource());
  //driver.close();
    }
    
    public void waitForSecond()  
    {  
            try  
           {  
                  Thread. sleep(1000);  
           }  
            catch (InterruptedException e)  
           {  
                  e.printStackTrace();  
           }  
    }  
    
}




 


以上的jar包和源码大家需要的话,可以联系我,QQ:369768231


对爬虫感兴趣的同学,请加我的Q群:101526096


后续还要做验证码的解决方案,有做过或者即将做的,也请加入Q群,一起讨论下。


开源才能进步,希望大家互相帮助,互相进步。

如果大家想找工作的话,可以联系我哈。我们公司招人,需要两种人

(1)技术大牛,我们的架构是hbase+hadoop那一套,以及自定义爬虫,所以需要牛逼的人进来解决各种问题,如果你觉得自己很牛逼,请联系我!

(2)java基础好,为人实在,热爱工作,对自己负责任,有创业精神的人。也可以联系我,我们这边有华尔街大牛,谷歌大牛,北大清华各种博士,对你自己是一个很好的提升!


办公地点在:北京中关村微软大厦。

公司网站:www.quantgroup.cn




你可能感兴趣的:(爬虫)