private WebClient getAWebClient() { WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24); webClient.getOptions().setTimeout(20000); // webClient.getCookieManager().setCookiesEnabled(true); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setJavaScriptEnabled(false); webClient.addRequestHeader("Accept", "textml,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); webClient.addRequestHeader("Accept-Encoding", "gzip, deflate"); webClient.addRequestHeader("Accept-Language", "en-US,en;q=0.5"); webClient.addRequestHeader("Cache-Control", "max-age=0"); webClient.addRequestHeader("Connection", "keep-alive"); webClient.addRequestHeader("Host", "www.amazon.com"); webClient.addRequestHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"); return webClient; }
/** * 采集网页 */ public StringBuilder crawlPage(String url) { StringBuilder builder = new StringBuilder(); logger.info(Thread.currentThread().getName() + " crawl " + url); // mygetpage代码放在这里 webClient.getCookieManager().clearCookies(); logger.info(Thread.currentThread().getName() + " webClient.getCookieManager().clearCookies();"); File file = new File(cookiePathAppendRandom()); logger.info(Thread.currentThread().getName() + " File file = new File(cookiePathAppendRandom());"); if (file.exists()) { FileInputStream fin = null; try { fin = new FileInputStream(file); } catch (FileNotFoundException e1) { e1.printStackTrace(); } CookieStore cookieStore = null; ObjectInputStream in; try { in = new ObjectInputStream(fin); cookieStore = (CookieStore) in.readObject(); in.close(); } catch (IOException e) { logger.error(e); } catch (ClassNotFoundException e) { logger.error(e); } Listl = cookieStore.getCookies(); for (org.apache.http.cookie.Cookie temp : l) { Cookie cookie = new Cookie(temp.getDomain(), temp.getName(), temp.getValue(), temp.getPath(), temp.getExpiryDate(), false); webClient.getCookieManager().addCookie(cookie); } } logger.info(Thread.currentThread().getName() + " MyGetPage start,url:" + url); HtmlPage page = MyGetPage(new StringBuffer(url)); logger.info(Thread.currentThread().getName() + " MyGetPage end,url:" + url); if (page == null) { // 采集过程中出现异常的model,可以统一放在一个list中,发送给server重新加入到采集分配队列 logger.info("Page null!"); AmazonCrawlModel model=new AmazonCrawlModel(crawlId, crawlURLId, url, depth,ischange); exceptionFun(model); return (new StringBuilder("getNullPage")); } logger.info(Thread.currentThread().getName() + " builder.append(page.asXml());"); builder.append(page.asXml()); logger.info(Thread.currentThread().getName() + " return builder;"); logger.info(Thread.currentThread().getName() +" CrawlPage $Length="+builder.toString().length()); if(builder.toString().length()<=300){ AmazonCrawlModel model=new AmazonCrawlModel(crawlId, crawlURLId, url, depth,ischange); exceptionFun(model); return (new StringBuilder("getNullPage")); } return builder; }
/*** * 自定义的getpage,遇到验证码页面识别直至成功 * */ private HtmlPage MyGetPage(StringBuffer URL) { HtmlPage page = null; boolean flag = true; int TryTimeCnt = 1; int UnknowHostTryTimeCnt = 1; while (flag) { flag = false; try { logger.info(Thread.currentThread().getName() + " webClient.getPage : " + URL + ",CrawlURL_id:" + crawlURLId); page = webClient.getPage(URL.toString()); Document doc = Jsoup.parse(page.asXml()); int robotchecknum = 1; while (doc.select("title").text().equals("Robot Check")) { logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis()) + " [Robot Check,URL:" + URL + "]"); String captcha_str = AmazonGetCaptcha.GetCaptcha(new StringBuilder(doc.toString())); logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis()) + " end AmazonGetCaptcha.GetCaptcha"); logger.info(dayformat1.format(new Date()) + " " + Thread.currentThread().getName() + " : " + captcha_str); HtmlForm form = null; logger.info(Thread.currentThread().getName() + " page.getForms().get(0) Start"); form = page.getForms().get(0); logger.info(Thread.currentThread().getName() + " page.getForms().get(0) End"); HtmlButton button = null; logger.info(Thread.currentThread().getName() + " form.getElementsByTagName(button).get(0) Start"); button = (HtmlButton) form.getElementsByTagName("button").get(0); logger.info(Thread.currentThread().getName() + " form.getElementsByTagName(button).get(0) End"); logger.info(Thread.currentThread().getName() + " setValueAttribute Start"); form.getInputByName("field-keywords").setValueAttribute(captcha_str); logger.info(Thread.currentThread().getName() + " setValueAttribute End"); logger.info(Thread.currentThread().getName() + " button.click Start"); boolean click_flag = false; while (!click_flag) { try { click_flag = true; page = button.click(); } catch (Exception e1) { logger.error(Thread.currentThread().getName() + " button.click出错了: " + e1); //e1.printStackTrace(); click_flag = false; } } logger.info(Thread.currentThread().getName() + " button.click end"); while (page.asXml() == null) { logger.info(Thread.currentThread().getName() + " page xml null"); logger.info(Thread.currentThread().getName() +" "+ page.asXml()); page.refresh(); logger.info(Thread.currentThread().getName() + " refresh End!"); } logger.info(Thread.currentThread().getName() + " button.click End"); logger.info(Thread.currentThread().getName() + " Start ParsePage!"); doc = Jsoup.parse(page.asXml()); if (!doc.select("title").text().equals("Robot Check")) { logger.info(Thread.currentThread().getName() + " " + doc.select("title").text()); logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis()) + " [Robot Check,captcha success:" + captcha_str + ",try num:" + robotchecknum + "]"); } robotchecknum++; } } catch (FailingHttpStatusCodeException e) { logger.error(Thread.currentThread().getName() +" "+ e); flag = true; } catch (MalformedURLException e) { logger.error(Thread.currentThread().getName() +" "+ e); flag = true; }catch(UnknownHostException e) { logger.error(Thread.currentThread().getName() +" "+ e); flag = true; logger.info("found UnknownHostException,start sleep 20 min"); try { Thread.sleep(1000*60*Integer.parseInt(Configuration.getProperties("unknowhost_sleeptime"))); } catch (InterruptedException e1) { logger.error(Thread.currentThread().getName() +" "+ e1); } logger.info("found UnknownHostException,end sleep 20 min"); UnknowHostTryTimeCnt++;// 访问异常数加一 logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis()) + " [UnknowHostTryTimeCnt:" + UnknowHostTryTimeCnt + "]"); if (UnknowHostTryTimeCnt > Integer.parseInt(Configuration.getProperties("unknowhost_maxtrytime"))) { return null; } }catch (Exception eq) { logger.error(Thread.currentThread().getName() + " "+eq); TryTimeCnt++;// 访问异常数加一 logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis()) + " [TryTimeCnt:" + TryTimeCnt + "]"); if (TryTimeCnt > 5) { return null; } try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); logger.error(Thread.currentThread().getName() + e); } flag = true; } try { Thread.sleep(random.nextInt(500) + 1500); } catch (InterruptedException e) { logger.error(Thread.currentThread().getName() + e); flag = true; } } return page; }