思路:从淘宝搜索结果页的链接进入每个商品页,用正则表达式匹配出购买过该商品的用户名,并处理搜索结果的分页。
大家看个意思吧:程序能运行,但淘宝的购买者列表已经改成通过 JS 调用加载了,没法再直接采集了。
package org.jason.web.spider.tabao;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Small screen-scraping demo: starting from a Taobao search-result URL it walks the
 * result pagination, opens every item page found there and collects the buyer names
 * that match {@link #USER}.
 *
 * <p>All extraction is done with the regular expressions below, so it only works as long
 * as the fetched HTML contains the matching markup. Every helper is best-effort: network
 * or regex failures are reported on {@code System.err} and yield an empty result instead
 * of an exception.
 */
public class UserSpider {
    /** Matches the "next page" anchor of a search-result page; group 1 is its href. */
    private static final String NEXT_PAGE = " <a href=\"(.*?)\" class=\"page-next\"><span>下一页</span></a>";
    /** Matches an item title anchor on a search-result page; group 1 is the item URL. */
    private static final String LINK = " <h3 class=\"summary\"><a href=\"(.*?)\" target=_blank onclick=\"(.*?)\" class=\"EventCanSelect\">(.*?)</a></h3>";
    /** Matches a buyer's profile anchor on an item page; group 2 is the displayed user name. */
    private static final String USER = " <a href=\"http://space.taobao.com/(.*?)/portal/personal_portal.htm\" target=\"_blank\">(.*?)</a>";
    /** Query string added to an item URL to request the buyer list. */
    private static final String PAR = "?bid_page=1&page_size=100&is_start=true";
    /** Upper bound on how many "next page" links are followed, so a bad page cannot loop forever. */
    private static final int MAX_NEXT_PAGES = 3;
    /** Search-result URL crawled when no URL is given on the command line. */
    private static final String DEFAULT_SEARCH_URL =
            "http://search1.taobao.com/browse/0/n-g,nfyg6za----------------40--commend-0-all-0.htm?at_topsearch=1&ssid=e-s5";
    /** Milliseconds to wait for the TCP connection before giving up. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    /** Milliseconds to wait for response data before giving up. */
    private static final int READ_TIMEOUT_MS = 30000;
    /** Extracts the charset parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Crawls the search results and prints every distinct buyer name found.
     *
     * @param args optional; {@code args[0]} overrides the built-in search-result URL
     */
    public static void main(String[] args) {
        String searchUrl = (args != null && args.length > 0) ? args[0] : DEFAULT_SEARCH_URL;
        // A set de-duplicates buyers that show up on several items.
        Set<String> users = new HashSet<String>();

        List<String> resultPages = getLinks(searchUrl);
        System.out.println("总链接数:" + resultPages.size());
        for (String resultPage : resultPages) {
            List<String> itemPages = getPages(resultPage);
            System.out.println("页面数:" + itemPages.size());
            for (String itemPage : itemPages) {
                List<String> buyers = getUsers(itemPage);
                System.out.println("用户数:" + buyers.size());
                users.addAll(buyers);
            }
        }

        for (String user : users) {
            System.out.println(user);
            // Persist the user here...
        }
    }

    /**
     * Returns the item-page URLs listed on one search-result page.
     *
     * @param sUrl URL of a search-result page
     * @return group 1 of every {@link #LINK} match; empty if the page could not be read
     */
    private static List<String> getPages(String sUrl) {
        String html = readURL(sUrl, false, System.getProperty("line.separator"));
        return replaceAll(html, LINK, 1);
    }

    /**
     * Returns the buyer names found on one item page.
     *
     * @param sUrl URL of an item page
     * @return group 2 of every {@link #USER} match; empty if the page could not be read
     */
    private static List<String> getUsers(String sUrl) {
        if (sUrl == null) {
            return new ArrayList<String>();
        }
        String html = readURL(buyerListUrl(sUrl), false, System.getProperty("line.separator"));
        return replaceAll(html, USER, 2);
    }

    /**
     * Appends the buyer-list parameters ({@link #PAR}) to an item URL.
     *
     * <p>If the URL already carries a query string the parameters are joined with
     * {@code &} instead of starting a second {@code ?}.
     *
     * @param itemUrl item page URL, must not be {@code null}
     * @return the URL with the buyer-list parameters added
     */
    static String buyerListUrl(String itemUrl) {
        if (itemUrl.indexOf('?') < 0) {
            return itemUrl + PAR;
        }
        return itemUrl + "&" + PAR.substring(1);
    }

    /**
     * Collects the URLs of the search-result pages by following "next page" links.
     *
     * @param baseUrl URL of the first search-result page
     * @return {@code baseUrl} followed by up to {@link #MAX_NEXT_PAGES} further page URLs;
     *         empty if {@code baseUrl} is {@code null} or empty
     */
    private static List<String> getLinks(String baseUrl) {
        List<String> pages = new ArrayList<String>();
        if (baseUrl == null || "".equals(baseUrl)) {
            return pages;
        }
        pages.add(baseUrl);

        String current = baseUrl;
        for (int followed = 0; followed < MAX_NEXT_PAGES; followed++) {
            String html = readURL(current, false, System.getProperty("line.separator"));
            if ("".equals(html)) {
                break;
            }
            String next = replace(html, NEXT_PAGE, 1);
            // Stop on the last page, and also if the link points back to a page already seen.
            if ("".equals(next) || pages.contains(next)) {
                break;
            }
            pages.add(next);
            current = next;
        }
        return pages;
    }

    /**
     * Returns one capture group of the first case-insensitive match of a pattern.
     *
     * @param str     text to search; may be {@code null}
     * @param pattern regular expression
     * @param place   index of the capture group to return
     * @return the group's text, or {@code ""} when {@code str} is {@code null}/empty, nothing
     *         matches, the group did not take part in the match, or the pattern/group index
     *         is invalid
     */
    public static String replace(String str, String pattern, int place) {
        if (str == null || "".equals(str)) {
            return "";
        }
        try {
            Matcher m = compile(pattern, Pattern.CASE_INSENSITIVE).matcher(str);
            if (m.find()) {
                String group = m.group(place);
                // An optional group that did not participate yields null; keep the "" contract.
                return group == null ? "" : group;
            }
        } catch (RuntimeException ex) {
            // Bad pattern or group index: this method is best-effort and never throws.
            System.err.println("UserSpider.replace failed for pattern '" + pattern + "': " + ex);
        }
        return "";
    }

    /**
     * Returns one capture group from every (case-sensitive) match of a pattern.
     *
     * @param str     text to search; may be {@code null}
     * @param pattern regular expression
     * @param i       index of the capture group to collect
     * @return the group's text for each match in order, skipping matches where the group did
     *         not take part; empty when {@code str} is {@code null}/empty or nothing matches.
     *         If the pattern or group index is invalid, whatever was collected so far is returned.
     */
    public static List<String> replaceAll(String str, String pattern, int i) {
        List<String> result = new ArrayList<String>();
        if (str == null || "".equals(str)) {
            return result;
        }
        try {
            Matcher m = Pattern.compile(pattern).matcher(str);
            while (m.find()) {
                String group = m.group(i);
                if (group != null) {
                    result.add(group);
                }
            }
        } catch (RuntimeException ex) {
            // Bad pattern or group index: this method is best-effort and never throws.
            System.err.println("UserSpider.replaceAll failed for pattern '" + pattern + "': " + ex);
        }
        return result;
    }

    /**
     * Compiles a regular expression with the given match flags.
     *
     * @param pattern regular expression
     * @param mode    bit mask of {@link Pattern} flags, e.g. {@link Pattern#CASE_INSENSITIVE}
     * @return the compiled pattern
     */
    public static Pattern compile(String pattern, int mode) {
        return Pattern.compile(pattern, mode);
    }

    /**
     * Picks the charset used to decode a response body.
     *
     * @param contentType value of the Content-Type header; may be {@code null}
     * @return the charset named by the header's {@code charset} parameter, or the platform
     *         default charset when the header is missing, has no such parameter, or names a
     *         charset this JVM does not support
     */
    static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PARAM.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: fall through to the default.
                }
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Fetches a URL over HTTP and returns the response body as text.
     *
     * <p>The body is decoded with the charset announced in the Content-Type header (see
     * {@link #charsetOf(String)}). The connection and the reader are always released, and
     * connect/read timeouts keep a stalled server from blocking the crawl.
     *
     * @param url    address to fetch; must be an http/https URL
     * @param isPost {@code true} to send the request with method POST instead of GET
     * @param line   separator appended after every line of the body
     * @return the body with {@code line} after each line; {@code ""} if the URL is invalid or
     *         not HTTP, or the status is not 200; if reading fails part-way, the text read so far
     */
    public static String readURL(String url, boolean isPost, String line) {
        StringBuilder content = new StringBuilder();
        HttpURLConnection http = null;
        try {
            URLConnection connection = new URL(url).openConnection();
            if (!(connection instanceof HttpURLConnection)) {
                System.err.println("UserSpider.readURL: not an HTTP URL: " + url);
                return "";
            }
            http = (HttpURLConnection) connection;
            http.setConnectTimeout(CONNECT_TIMEOUT_MS);
            http.setReadTimeout(READ_TIMEOUT_MS);
            if (isPost) {
                http.setRequestMethod("POST");
            }
            // getResponseCode() connects implicitly, so no separate connect() call is needed.
            if (http.getResponseCode() == HttpURLConnection.HTTP_OK) {
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(http.getInputStream(), charsetOf(http.getContentType())));
                try {
                    String row;
                    while ((row = reader.readLine()) != null) {
                        content.append(row).append(line);
                    }
                } finally {
                    reader.close();
                }
            }
        } catch (IOException ex) {
            System.err.println("UserSpider.readURL failed for " + url + ": " + ex);
        } catch (RuntimeException ex) {
            // Kept broad on purpose: callers rely on this method never throwing.
            System.err.println("UserSpider.readURL failed for " + url + ": " + ex);
        } finally {
            if (http != null) {
                http.disconnect();
            }
        }
        return content.toString();
    }
}