<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<#--
  Bestseller crawl page (FreeMarker template rendered by a Struts2 action).

  Top half: a form to choose a site, a fetch size and a category (up to three
  levels, loaded via AJAX) and to start a crawl.
  Bottom half: the crawled bestseller list, rendered only when the action put
  "bestsellerList" into the model.

  Relies on createXmlHttpRequest(), addOption() and trim() from
  js/category/category.js.
-->
<html xmlns="http://www.w3.org/1999/xhtml" lang="zh-CN">
<head>
  <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
  <title>畅销品抓取-商品列表</title>
  <#include "/WEB-INF/templates/common/include/css.ftl">
  <#include "/WEB-INF/templates/common/include/js.ftl">
  <script type="text/javascript" src="js/category/category.js"></script>
  <#-- NOTE(review): two jQuery builds are loaded and the second replaces the first.
       Both are kept because it is not visible here which one other includes rely on. -->
  <script type="text/javascript" src="js/jquery/jquery-1.4.2.min.js"></script>
  <script type="text/javascript" src="js/jquery/jquery.min.js"></script>
  <script type="text/javascript">
    // Shared request handle: the callbacks below read the in-flight request from here.
    var xmlHttpRequest;

    // Shows the second/third level <select> only while it contains options.
    function initPage() {
      var selectIds = ["second", "third"];
      for (var k = 0; k < selectIds.length; k++) {
        var select = document.getElementById(selectIds[k]);
        select.style.display = (select.length == 0) ? "none" : "block";
      }
    }

    // Removes every option from the <select> with the given id (no-op if it is missing).
    function clearSelect(selectId) {
      var select = document.getElementById(selectId);
      if (select) {
        select.options.length = 0;
      }
    }

    // Drops all third-level state (select options and the checkbox list) so that a
    // choice made under a previous parent category can never be submitted.
    function resetThirdLevel() {
      clearSelect("third");
      var checkboxArea = document.getElementById("level31");
      if (checkboxArea) {
        checkboxArea.innerHTML = "";
        checkboxArea.style.display = "none";
      }
      var selectArea = document.getElementById("level3");
      if (selectArea) {
        selectArea.style.display = "block";
      }
    }

    // Returns the category checkboxes currently rendered inside #level31.
    function getCategoryCheckboxes() {
      var found = [];
      var checkboxArea = document.getElementById("level31");
      if (!checkboxArea) {
        return found;
      }
      var inputs = checkboxArea.getElementsByTagName("input");
      for (var k = 0; k < inputs.length; k++) {
        if (inputs[k].type == "checkbox") {
          found.push(inputs[k]);
        }
      }
      return found;
    }

    // Loads the first-level categories of the selected site.
    function getFirstLevelCat() {
      var siteId = document.getElementById("siteId").value;
      var url = "${base}/firstLevelCateAjax.action?siteId=" + encodeURIComponent(siteId);
      // The lower levels belong to the previously selected site: discard them.
      clearSelect("second");
      resetThirdLevel();
      initPage();
      xmlHttpRequest = createXmlHttpRequest();
      xmlHttpRequest.onreadystatechange = handleCategory;
      xmlHttpRequest.open("GET", url, true);
      xmlHttpRequest.send(null);
    }

    // Callback for getFirstLevelCat(): fills the first-level <select>.
    function handleCategory() {
      if (xmlHttpRequest.readyState != 4 || xmlHttpRequest.status != 200) {
        return;
      }
      // SECURITY NOTE(review): eval() executes whatever the server returns. Kept because
      // the response format is not visible here; switch to a JSON parser once confirmed.
      var array = eval(xmlHttpRequest.responseText);
      addOption(array[0], 'first', '\u8bf7\u9009\u62e9');
      // Show/hide the lower-level selects.
      initPage();
    }

    // Loads the children of category `categoryLevel` into the <select> `selectId`.
    function getNextLevelCat(categoryLevel, selectId) {
      // Whatever was shown on the third level belongs to the old parent.
      resetThirdLevel();
      if (categoryLevel == '') {
        // The placeholder entry was chosen: there is nothing to load.
        clearSelect(selectId);
        initPage();
        return;
      }
      var url = "${base}/nextLevelCateAjax.action?parentCategoryId=" + encodeURIComponent(categoryLevel);
      xmlHttpRequest = createXmlHttpRequest();
      xmlHttpRequest.onreadystatechange = function () { displayCategory(selectId); };
      xmlHttpRequest.open("GET", url, true);
      xmlHttpRequest.send(null);
    }

    // Callback for getNextLevelCat(): fills the <select> `selectId`.
    function displayCategory(selectId) {
      if (xmlHttpRequest.readyState != 4 || xmlHttpRequest.status != 200) {
        return;
      }
      // SECURITY NOTE(review): see handleCategory() regarding eval().
      var array = eval(xmlHttpRequest.responseText);
      addOption(array[0], selectId, '\u8bf7\u9009\u62e9');
      // Show/hide the lower-level selects.
      initPage();
    }

    // Validates the form, resolves the category selection and submits the crawl.
    //   type == "children": crawl all child categories (crawlAllChildren.action)
    //   anything else:      crawl the selected category (crawlBestseller.action)
    // The submitted id is the deepest selection available: third-level select, then
    // the checkbox list ("id,amount,id,amount,"), then second level, then first level.
    function crawlProduct(type) {
      var digitsOnly = /^\d+$/;
      var fetchSize = document.getElementById("fetchSize").value;
      if (!digitsOnly.test(fetchSize)) {
        alert("抓取数量必须为数字!");
        return;
      }
      // The crawl service rejects amounts <= 0, so catch zero before submitting.
      if (parseInt(fetchSize, 10) <= 0) {
        alert("抓取数量必须大于0!");
        return;
      }

      var firstSelect = document.getElementById("first");
      var secondSelect = document.getElementById("second");
      var thirdSelect = document.getElementById("third");
      var checkboxes = getCategoryCheckboxes();
      var categoryId;

      if (thirdSelect && thirdSelect.value != '') {
        categoryId = thirdSelect.value;
      } else if (checkboxes.length > 0) {
        var pairs = "";
        for (var i = 0; i < checkboxes.length; i++) {
          if (!checkboxes[i].checked) {
            continue;
          }
          var amountInput = document.getElementById("checkbox" + checkboxes[i].value);
          var amount = amountInput ? amountInput.value : "";
          // crawlBestseller parses every amount as an integer; how crawlAllChildren
          // treats the amounts is not visible here, so that path is not validated.
          if (type != "children" && (!digitsOnly.test(amount) || parseInt(amount, 10) <= 0)) {
            alert("请为勾选的分类填写大于0的抓取数量!");
            return;
          }
          pairs += checkboxes[i].value + "," + amount + ",";
        }
        categoryId = pairs;
      } else if (secondSelect && secondSelect.value != '') {
        categoryId = secondSelect.value;
      } else {
        categoryId = firstSelect.value;
      }

      if (categoryId == '') {
        alert("请选择分类!");
        return;
      }
      document.getElementById("categoryId").value = categoryId;

      var bestsellerForm = document.getElementById("bestsellerForm");
      bestsellerForm.action = (type == "children") ? "crawlAllChildren.action" : "crawlBestseller.action";
      bestsellerForm.submit();
    }

    // Submits the form to the export action.
    function exportProduct() {
      var bestsellerForm = document.getElementById("bestsellerForm");
      bestsellerForm.action = "exportBestseller.action";
      bestsellerForm.submit();
    }
  </script>
</head>
<body onload="initPage(); getFirstLevelCat();">
<div class="pis_width">
  <!-- head begin -->
  <#include "/WEB-INF/templates/head.ftl">
  <!-- head end -->
  <!-- left begin -->
  <#include "/WEB-INF/templates/left.ftl">
  <!-- left end -->
  <!-- content begin -->
  <div class="pis_content">
    <div><font color="red"><b>您的位置:畅销品抓取-商品列表</b></font></div>
    <br/>
    <div><b>请设置待匹配一号店商品:</b></div>
    <@s.form id="bestsellerForm" name="productForm" action="${base}/addCategoryMatch.action" method="post">
      <#-- Filled by crawlProduct() right before the form is submitted. -->
      <input type="hidden" id="categoryId" name="siteCategory.ids" value="" />
      <table class="mt10" width="98%" align="center" border="0" cellpadding="0" cellspacing="0">
        <tbody valign="middle">
          <tr bgcolor="#edf5fa">
            <td width="20%">网站名称:</td>
            <td style="padding: 0px;" colspan="2">
              <select id="siteId" name="siteCategory.siteId" onchange="getFirstLevelCat();">
                <#list siteList as site>
                <option value="${site.siteId}">${site.siteName?html}</option>
                </#list>
              </select>
            </td>
          </tr>
          <tr bgcolor="#edf5fa">
            <td width="20%">抓取数量:</td>
            <td style="padding: 0px;" colspan="2">
              <input type="text" id="fetchSize" name="siteCategory.fetchSize" value="100"/>
              <font color="red">*</font>
            </td>
          </tr>
          <tr><td colspan="3"></td></tr>
          <tr bgcolor="#edf5fa">
            <td width="20%">一级分类:</td>
            <td style="padding: 0px;">
              <select id="first" onchange="getNextLevelCat(this.value,'second');"></select>
            </td>
            <td style="padding: 0px;">
              <input type="button" value="按分类抓取" class="button2" onclick="crawlProduct('this');"/>
              <input type="button" value="抓取子类商品" class="button2" onclick="crawlProduct('children');"/>
            </td>
          </tr>
          <tr bgcolor="#edf5fa">
            <td width="20%">二级分类:</td>
            <td width="80%" colspan="2">
              <span id="level2">
                <select id="second" onchange="getNextLevelCat(this.value,'third');"></select>
              </span>
            </td>
          </tr>
          <tr bgcolor="#edf5fa">
            <td width="20%">三级分类:</td>
            <td colspan="2">
              <span id="level3"><select id="third"></select></span>
              <#-- addOption() renders a checkbox list here for sites that use multi-select. -->
              <span id="level31"></span>
            </td>
          </tr>
        </tbody>
      </table>
      <hr/>
      <input type="hidden" id="pageUrl" name="matchProductDto.pageUrl" value="" />
      <#-- The whole result section, including its closing table tags, is conditional. -->
      <#if bestsellerList??>
      <div><font color="red"><b>
        <#if siteCategory??>
        目标网站:${siteCategory.siteName?html} 分类名称:${siteCategory.categoryName?html} 抓取商品数目:${siteCategory.fetchSize} 实际商品数目:${size}
        </#if>
      </b></font></div>
      <br/>
      <div><input type="button" value="导出畅销商品" class="button2" onclick="exportProduct();"/></div>
      <br/>
      <table id="productTable" class="mt10" width="98%" align="center" border="0" cellpadding="0" cellspacing="1">
        <thead>
          <tr>
            <th align="center" width="5%">畅销排名</th>
            <th align="center" width="20%">商品名称</th>
            <th align="center" width="10%">商品价格</th>
            <th align="center" width="10%">商品URL</th>
          </tr>
        </thead>
        <tbody id="searchProduct">
          <#-- Names, prices and URLs come from crawled third-party pages: always escape. -->
          <#list bestsellerList as crawledProduct>
          <tr bgcolor="#edf5fa">
            <td align="center">${crawledProduct_index+1}</td>
            <td align="center">${crawledProduct.name?html}</td>
            <#if crawledProduct.imgPrice??>
            <td align="center"><img src="${(crawledProduct.imgPrice!'')?html}" alt="价格图片"/>(${(crawledProduct.price!'')?html})</td>
            <#elseif crawledProduct.price??>
            <td align="center">${(crawledProduct.price!'')?html}</td>
            <#else>
            <td align="center">缺货</td>
            </#if>
            <td align="center"><a href="${crawledProduct.url?html}" target="_blank" rel="noopener noreferrer">查看商品</a></td>
          </tr>
          </#list>
        </tbody>
      </table>
      </#if>
    </@s.form>
  </div>
</div>
<#-- NOTE(review): the original markup had this third closing tag as well; it is kept in
     case one of the included layout templates leaves a <div> open. -->
</div>
<!-- content end -->
</body>
</html>
// Category / product AJAX helpers shared by the crawl and match pages.
//
// Every request helper stores the in-flight request in the global `xmlHttpRequest`;
// the *Result / list* callbacks read it from there. Starting a second request before
// the first one completes therefore replaces the handle the first callback sees.
//
// NOTE(review): the request URLs start with a "base" template placeholder, so this
// file is presumably run through the template engine before delivery - confirm.
//
// SECURITY NOTE(review): responses are parsed with eval(), which executes whatever the
// server sends. Kept because the response format is not visible from this file; replace
// with a JSON parser once the endpoints are confirmed to return strict JSON.
var xmlHttpRequest;

/**
 * Creates a request object, preferring the native XMLHttpRequest and falling back
 * to ActiveX for legacy Internet Explorer. Returns null when neither is available.
 */
function createXmlHttpRequest() {
  if (window.XMLHttpRequest) {
    return new XMLHttpRequest();
  }
  if (window.ActiveXObject) {
    return new ActiveXObject("Microsoft.XMLHTTP");
  }
  return null;
}

/** Escapes a value for safe use in HTML text and double-quoted attribute values. */
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Requests category data and hides the levels below the current selection.
 *
 * @param process     value of the "process" request parameter
 * @param result      readystatechange callback (e.g. SecondResult, ThirdResult)
 * @param firstValue  selected first-level id ("" when nothing is selected)
 * @param secondValue selected second-level id ("" when nothing is selected)
 */
function getCategory(process, result, firstValue, secondValue) {
  var level2 = document.getElementById("level2");
  var level3 = document.getElementById("level3");
  var product = document.getElementById("productName");

  // Keep only the leading placeholder option.
  document.getElementById('third').options.length = 1;
  var productSelect = document.getElementById('product');
  if (productSelect) {
    productSelect.options.length = 1;
  }

  if (firstValue == "") {
    level2.style.display = "none";
    level3.style.display = "none";
    if (product) {
      product.style.display = "none";
    }
  } else if (secondValue == "") {
    level3.style.display = "none";
    if (product) {
      product.style.display = "none";
    }
  }
  // A third branch used to test `level3 == ""`; that compared a DOM element with a
  // string, was never true, and has been removed.

  var url = "${base}/category_getCategoryAjax.action?process=" + process + "&firstId=" + firstValue + "&secondId=" + secondValue;
  xmlHttpRequest = createXmlHttpRequest();
  xmlHttpRequest.onreadystatechange = result;
  xmlHttpRequest.open("GET", url, true);
  xmlHttpRequest.send(null);
}

/** Requests the products of a category; falls back to the second-level selection. */
function getProduct(categoryId, processMethod) {
  if (categoryId == "") {
    categoryId = document.getElementById("second").value;
  }
  var url = "${base}/getProduct.action?categoryId=" + categoryId;
  xmlHttpRequest = createXmlHttpRequest();
  xmlHttpRequest.onreadystatechange = processMethod;
  xmlHttpRequest.open("GET", url, true);
  xmlHttpRequest.send(null);
}

/** Requests the match products of a category and resets the product select. */
function getMatchProduct(categoryId, processMethod) {
  document.getElementById('product').selectedIndex = 0;
  if (categoryId == "") {
    categoryId = document.getElementById("second").value;
  }
  var url = "${base}/matchProduct_getProductByCategory.action?matchProductDto.categoryId=" + categoryId;
  xmlHttpRequest = createXmlHttpRequest();
  xmlHttpRequest.onreadystatechange = processMethod;
  xmlHttpRequest.open("GET", url, true);
  xmlHttpRequest.send(null);
}

/**
 * escape()-based encoder: "+" becomes "%2B", two-digit escapes >= 0x80 become
 * "%u00XX", and the first "%25" becomes "%u0025".
 */
function uniencode(text) {
  text = escape(text.toString()).replace(/\+/g, "%2B");
  var matches = text.match(/(%([0-9A-F]{2}))/gi);
  if (matches) {
    for (var matchid = 0; matchid < matches.length; matchid++) {
      var code = matches[matchid].substring(1, 3);
      if (parseInt(code, 16) >= 128) {
        text = text.replace(matches[matchid], '%u00' + code);
      }
    }
  }
  text = text.replace('%25', '%u0025');
  return text;
}

/**
 * Starts a keyword crawl for a product on the given site.
 *
 * NOTE(review): the keyword is sent unencoded, as before. Earlier attempts with
 * escape / encodeURI / encodeURIComponent were left disabled by the author, so the
 * server-side decoding must be checked before any encoding is introduced here.
 */
function getCrawledProduct(productId, siteId, keyword, processMethod) {
  var url = "${base}/crawlProductByKeyword.action?productId=" + productId + "&siteId=" + siteId + "&keyword=" + keyword;
  xmlHttpRequest = createXmlHttpRequest();
  xmlHttpRequest.onreadystatechange = processMethod;
  xmlHttpRequest.open("GET", url, true);
  // Repeated setRequestHeader calls are combined into one header, so setting three
  // different Content-Type values produced an invalid one; only the first is kept.
  xmlHttpRequest.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
  xmlHttpRequest.send(null);
}

/**
 * Callback used on page initialisation: loads the first-level categories and hides
 * the second/third level and the product area.
 */
function InitResult() {
  if (xmlHttpRequest.readyState == 4 && xmlHttpRequest.status == 200) {
    var array = eval(xmlHttpRequest.responseText);
    addOption(array[0], 'first', '\u9009\u62e9\u76ee\u5f55');
  }
  // Hide the lower levels (runs on every state change, as before).
  var level2 = document.getElementById("level2");
  var level3 = document.getElementById("level3");
  var product = document.getElementById("productName");
  level2.style.display = "none";
  level3.style.display = "none";
  if (product) {
    product.style.display = "none";
  }
}

/** Callback: fills the second-level select and shows it. */
function SecondResult() {
  if (xmlHttpRequest.readyState == 4 && xmlHttpRequest.status == 200) {
    var array = eval(xmlHttpRequest.responseText);
    addOption(array[0], 'second', '\u5168\u90E8');
    document.getElementById("level2").style.display = "block";
  }
}

/** Callback: fills the third-level select and shows it. */
function ThirdResult() {
  if (xmlHttpRequest.readyState == 4 && xmlHttpRequest.status == 200) {
    var array = eval(xmlHttpRequest.responseText);
    addOption(array[0], 'third', '\u5168\u90E8');
    document.getElementById("level3").style.display = "block";
  }
}

/**
 * Replaces the options of the select `id` with the categories in `array`.
 *
 * @param array categories; each entry provides id, categoryName and siteId
 * @param id    id of the target select element
 * @param head  text of the leading placeholder option ('' for none)
 *
 * When the target is 'third' and the first entry has siteId 9, the categories are
 * rendered into #level31 as a checkbox list (each with a text input whose id is
 * "checkbox" + category id) and the regular select area #level3 is hidden.
 * Pages without #level31 / #level3 are supported: those elements are optional.
 */
function addOption(array, id, head) {
  var select = document.getElementById(id);
  var checkboxArea = document.getElementById('level31');
  var selectArea = document.getElementById('level3');

  // Switches back to the regular select and drops stale checkboxes, which would
  // otherwise still be found (hidden) by code that collects the checked categories.
  function showSelectArea() {
    if (checkboxArea) {
      checkboxArea.innerHTML = '';
      checkboxArea.style.display = "none";
    }
    if (selectArea) {
      selectArea.style.display = "block";
    }
  }

  function appendOption(text, value) {
    var option = document.createElement("OPTION");
    option.text = text;
    option.value = value;
    select.options.add(option);
  }

  var i;
  select.options.length = 0;

  if (array.length == 0) {
    showSelectArea();
    if (head != '') {
      appendOption(head, '');
    }
  } else if (array.length > 0) {
    // The production site id is 10.
    if (array[0].siteId == '9' && id == 'third' && checkboxArea) {
      checkboxArea.style.display = "block";
      if (selectArea) {
        selectArea.style.display = "none";
      }
      var html = '';
      for (i = 0; i < array.length; i++) {
        // Category data originates from crawled sites: escape before building HTML.
        var safeId = escapeHtml(array[i].id);
        html += '<input type="checkbox" name="checkbox" value="' + safeId + '"/>' + escapeHtml(trim(array[i].categoryName));
        html += '<input type="text" id="checkbox' + safeId + '" name="checkboxvalue"/><br/>';
      }
      checkboxArea.innerHTML = html;
    } else {
      showSelectArea();
      if (head != '') {
        appendOption(head, '');
      }
      for (i = 0; i < array.length; i++) {
        appendOption(trim(array[i].categoryName), array[i].id);
      }
    }
  }
}

/** Callback: fills the product select from the response and shows the product area. */
function listProduct() {
  if (xmlHttpRequest.readyState == 4 && xmlHttpRequest.status == 200) {
    var products = eval(xmlHttpRequest.responseText)[0];
    var select = document.getElementById("product");
    select.options.length = 0;

    var placeholder = document.createElement("OPTION");
    placeholder.text = '\u9009\u62e9\u5546\u54c1';
    placeholder.value = '';
    select.options.add(placeholder);

    for (var i = 0; i < products.length; i++) {
      var option = document.createElement("OPTION");
      option.text = trim(products[i].productCname);
      option.value = products[i].id;
      select.options.add(option);
    }

    document.getElementById("productName").style.display = "block";
  }
}

// NOTE(review): this function is left exactly as found. It reads `tdname` and `unit`,
// which are not defined in this file, advances the loop counter a second time via
// `tr.id=++i`, and never attaches the rows it builds. Confirm whether it is still used.
function listSearchProduct() {
  if (xmlHttpRequest.readyState == 4 && xmlHttpRequest.status == 200) {
    var array = eval(xmlHttpRequest.responseText);
    alert("array=" + array);
    for (i = 0; i < array.length; i++) {
      var table = document.getElementById("productTable");
      var length = table.rows.length;
      var tr = document.createElement("tr");
      tr.id = ++i;
      var td0 = document.createElement("td");
      // Rendered as a text input so that the value can be read back later.
      td0.innerHTML = "<input type=\"text\" value=\"" + array[i] + "\"/>";
      tr.appendChild(td0);
      var td1 = document.createElement("td");
      tdname = "第" + i + "行 " + tdname;
      td1.innerHTML = "<input type=\"text\" value=\"" + tdname + "\" name=\"name\" id=\"name\"/>";
      tr.appendChild(td1);
      var td2 = document.createElement("td");
      td2.innerHTML = "<input type=\"text\" value=\"" + unit + "\" name=\"unit\"/>";
      tr.appendChild(td2);
      var td3 = document.createElement("td");
      td3.innerHTML = "<input type=\"button\" value=\"del\" onclick=\"del(this.parentElement.parentElement,this.parentElement.parentElement.id)\"/>";
      tr.appendChild(td3);
    }
    // Show the product area.
    var product = document.getElementById("productName");
    product.style.display = "block";
  }
}

// Builds a table row and appends it to #newbody when validation succeeds.
// `i` is the id of the previous row.
// NOTE(review): left exactly as found. It depends on `num`, `tdname`, `unit`, `flag`
// and addData(), none of which are defined in this file.
function addRow(i) {
  var table = document.getElementById("productTable");
  var length = table.rows.length;
  var tr = document.createElement("tr");
  tr.id = ++i;
  var td0 = document.createElement("td");
  // Rendered as a text input so that the value can be read back later.
  td0.innerHTML = "<input type=\"text\" value=\"" + num + "\"/>";
  tr.appendChild(td0);
  var td1 = document.createElement("td");
  tdname = "第" + i + "行 " + tdname;
  td1.innerHTML = "<input type=\"text\" value=\"" + tdname + "\" name=\"name\" id=\"name\"/>";
  tr.appendChild(td1);
  var td2 = document.createElement("td");
  td2.innerHTML = "<input type=\"text\" value=\"" + unit + "\" name=\"unit\"/>";
  tr.appendChild(td2);
  var td3 = document.createElement("td");
  td3.innerHTML = "<input type=\"button\" value=\"del\" onclick=\"del(this.parentElement.parentElement,this.parentElement.parentElement.id)\"/>";
  tr.appendChild(td3);
  addData(i); // includes data validation
  if (flag == true) {
    document.getElementById("newbody").appendChild(tr);
  } else {
    i--;
  }
}

/** Callback: fills the product select with match products and shows the product area. */
function listMatchProduct() {
  if (xmlHttpRequest.readyState == 4 && xmlHttpRequest.status == 200) {
    var select = document.getElementById("product");
    select.options.length = 0;
    var products = eval(xmlHttpRequest.responseText)[0];

    var placeholder = document.createElement("OPTION");
    placeholder.text = '\u9009\u62e9\u76ee\u5f55';
    placeholder.value = '';
    select.options.add(placeholder);

    for (var i = 0; i < products.length; i++) {
      var option = document.createElement("OPTION");
      option.text = trim(products[i].name);
      option.value = products[i].productId;
      select.options.add(option);
    }

    document.getElementById("productName").style.display = "block";
  }
}

/** Strips leading and trailing whitespace; null and undefined yield "". */
function trim(str) {
  if (str === null || str === undefined) {
    return "";
  }
  return String(str).replace(/^\s+|\s+$/g, "");
}
public String crawlBestseller() { int id = 0; int amount = 0;//siteCategory.getFetchSize(); int index = 1; String CateName = ""; int fetchSize = 0 ; String[] idStrings = null; //针对苏宁易购需求所做的修改 if (siteCategory.getIds().indexOf(",")>0) {//说明是苏宁的多选子类 idStrings =siteCategory.getIds().split(","); index = idStrings.length; } for (int i = 0; i < index; i=i+2) {//一般情况下是执行一次,苏宁易购有执行多次的可能 List<BestSellerDto> transferList ; if (index>1) { id=Integer.parseInt(idStrings[i]); amount = Integer.parseInt(idStrings[i+1]); }else{ id=Integer.parseInt(siteCategory.getIds()); amount=siteCategory.getFetchSize(); } // 根据id获取分类信息 siteCategory = siteCategorySvc.getSiteCategoryById(id); fetchSize+=amount; if (CateName == "") { CateName = siteCategory.getCategoryName(); }else { CateName = CateName + "-"+siteCategory.getCategoryName(); } // 追加网站名称 // 将网站信息放入内存map List<SiteDto> siteList = (List<SiteDto>)ActionContext.getContext().getSession().get("siteList"); Map<Integer,SiteDto> siteMap = new HashMap<Integer,SiteDto>(); for(SiteDto site : siteList) { siteMap.put(site.getSiteId(), site); } String siteName = siteMap.get(siteCategory.getSiteId()).getSiteName(); siteCategory.setSiteName(siteName); ActionContext.getContext().getSession().put("params", siteCategory); // 获取畅销商品列表 transferList = bestsellerSvc.fetchBestSeller(id, amount); for(BestSellerDto bestseller : transferList) { String price = bestseller.getPrice(); if( price != null && price.contains("http")) { // 解析图片价格 bestseller.setImgPrice(price); BigDecimal decimal = ImgUtils.getDecimalFromImg(price); if(decimal != null) { bestseller.setPrice(decimal.toString()); } } } if (i==0) { bestsellerList = transferList; }else { bestsellerList.addAll(transferList); } } siteCategory.setCategoryName(CateName); siteCategory.setFetchSize(fetchSize); ActionContext.getContext().getSession().put("topProducts", bestsellerList); ServletActionContext.getContext().put("size", bestsellerList.size()); return SUCCESS; }
/**
 * Crawls the best-seller ranking of a site category.
 *
 * <p>The category's ranking page is fetched and parsed; further pages are fetched
 * through the parser's next-page URL pattern until enough items were collected, a
 * page is empty, a page repeats the previous one, or the page limit derived from
 * {@code amount} is reached. I/O and parse failures are logged and end the crawl;
 * the items collected up to that point are still returned.
 *
 * @param id     id of the site category to crawl
 * @param amount maximum number of items to return; must be positive
 * @return at most {@code amount} best sellers in ranking order; never {@code null},
 *         empty when {@code amount <= 0}, the category is unknown, it has no ranking
 *         URL, or nothing could be extracted
 */
public List<BestSellerDto> fetchBestSeller(Integer id, int amount) {
    List<BestSellerDto> bestSellers = new ArrayList<BestSellerDto>();
    if (amount <= 0) {
        logger.warn("畅销品数目必须为正数!");
        return bestSellers;
    }
    SiteCategoryDto siteCategory = getSiteCategoryById(id);
    if (siteCategory == null) {
        logger.warn("未找到分类,ID=" + id);
        return bestSellers;
    }
    String categoryUrl = siteCategory.getCategoryUrl();
    if (categoryUrl == null || categoryUrl.equals("")) {
        logger.info("此分类不支持畅销榜!");
        return bestSellers;
    }

    // Merge the page parameters and the crawl parameters of the site.
    Map<String, String> params = pageParamItemDao.getPageConfigBySite(siteCategory.getSiteId());
    params.putAll(crawlerParamItemDao.getCrawlConfigBySite(siteCategory.getSiteId()));

    String charset = params.get(PageParamNames.CONTENT_ENCODING);
    // Assumed page size; only used as an upper bound for the number of pages fetched.
    int pageSize = 20;
    int pages = amount / pageSize + 1;
    logger.info("畅销榜页数:" + pages);

    Crawler crawler = new Crawler(charset);
    try {
        String bestSellerHtml = crawler.crawl(categoryUrl);
        PageParser pageParser = PageParserFactory.createPageParser(null, charset, params);
        String nextPageUrlPattern = pageParser.extractNextPageUrlPattern(bestSellerHtml);

        // For Suning the ranking itself is reached through the paging URL, so the
        // first ranking page has to be fetched again via the pattern.
        if (categoryUrl.indexOf("suning") > 0 && nextPageUrlPattern != null) {
            bestSellerHtml = crawler.crawl(pageParser.getNextPageUrl(nextPageUrlPattern, 1));
        }

        List<BestSellerDto> firstPage = pageParser.extractBestSeller(bestSellerHtml);
        if (firstPage != null) {
            bestSellers.addAll(firstPage);
        }

        if (nextPageUrlPattern != null) {
            // Stop as soon as enough items are collected: the rest would be cut off anyway.
            for (int pageNum = 2; pageNum <= pages && bestSellers.size() < amount; pageNum++) {
                logger.info("抓取畅销榜第 " + pageNum + " 页");
                bestSellerHtml = crawler.crawl(pageParser.getNextPageUrl(nextPageUrlPattern, pageNum));
                List<BestSellerDto> moreBestSellers = pageParser.extractBestSeller(bestSellerHtml);
                if (moreBestSellers == null || moreBestSellers.isEmpty()) {
                    break;
                }
                // Same last item as before: the site served the previous page again.
                if (!bestSellers.isEmpty()) {
                    String lastUrl = bestSellers.get(bestSellers.size() - 1).getUrl();
                    String newLastUrl = moreBestSellers.get(moreBestSellers.size() - 1).getUrl();
                    if (lastUrl != null && lastUrl.equals(newLastUrl)) {
                        break;
                    }
                }
                bestSellers.addAll(moreBestSellers);
            }
        }
    } catch (IOException e) {
        logger.error("抓取 " + siteCategory.getCategoryName() + " 畅销榜时出现异常!"
                + "URL为:" + siteCategory.getCategoryUrl(), e);
    } catch (ParserException e) {
        logger.error("解析畅销榜页面时出现异常!" + "URL为:"
                + siteCategory.getCategoryUrl(), e);
    }

    logger.info("++++++++++++++++++++++++++++++++++");
    logger.info("畅销集合大小:" + bestSellers.size());
    if (bestSellers.size() <= amount) {
        return bestSellers;
    }
    // Copy instead of returning the subList view: the view is not serializable and
    // keeps the complete list alive, and callers store the result in the session.
    List<BestSellerDto> subList = new ArrayList<BestSellerDto>(bestSellers.subList(0, amount));
    logger.info("----------------------------------");
    logger.info("畅销集合大小:" + subList.size());
    return subList;
}
package com.yihaodian.pis.crawler;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.tags.Bullet;
import org.htmlparser.tags.BulletList;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.Span;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.yihaodian.pis.dto.BestSellerDto;

/**
 * {@link PageParser} for Suning best-seller list pages.
 *
 * <p>Only the best-seller extraction and the paging URL handling are implemented;
 * the single-product extractors are stubs.
 */
public class SuningPageParser extends PageParser {

    private static final Logger logger = Logger.getLogger(SuningPageParser.class);

    /** Prefix for the site-relative product links found in the list. */
    private static final String SITE_ROOT = "http://www.suning.cn";

    /** Base of the paging servlet URLs. */
    private static final String SERVLET_BASE = SITE_ROOT + "/webapp/wcs/stores/servlet/";

    /** Value of the class attribute of the element wrapping the product list. */
    private static final String PRODUCT_LIST_CLASS = "product_list02 profix02 clearfix";

    /**
     * Query parameter carrying the page index. It also identifies the paging script.
     * The literal had been corrupted to "¤tPage=" (the sequence "&amp;curren" decoded
     * as the HTML entity for the currency sign), which broke both the script
     * detection and the generated URLs.
     */
    private static final String PAGE_PARAM = "&currentPage=";

    /** Sort-by-sales query string; the page index is appended to it. */
    private static final String SORT_QUERY = "&ip_sortBy=salevolumn0&sortType=4" + PAGE_PARAM;

    /** Captures the string literal assigned to the script variable {@code jumpUrl}. */
    private static final Pattern JUMP_URL_PATTERN = Pattern.compile(
            "(?<=var[ \\s]{0,100}(jumpUrl)[\\s]{0,100}[=][\\s]{0,100}[\"])(.*?)(?=\"\\s{0,100}[+])");

    /** Captures the string literal assigned to the script variable {@code dfy}. */
    private static final Pattern DFY_PATTERN = Pattern.compile(
            "(?<=var[ \\s]{0,100}dfy\\s{0,100}=\\s{0,100}[\"])(.*?)(?=[\"][\\s]{0,100})");

    public SuningPageParser(String html, String charset) {
        super(html, charset);
    }

    /**
     * Extracts the products of one best-seller list page.
     *
     * @param bestSellerHtml HTML of the list page
     * @return the products in page order, or {@code null} when the page does not
     *         contain the product list container; items without a name link are
     *         skipped with a warning instead of aborting the whole page
     */
    @Override
    public List<BestSellerDto> extractBestSeller(String bestSellerHtml) throws ParserException {
        if (bestSellerHtml == null) {
            return null;
        }
        Parser parser = Parser.createParser(bestSellerHtml, charset);
        NodeFilter filter = new HasAttributeFilter("class", PRODUCT_LIST_CLASS);
        NodeList nodeList = parser.extractAllNodesThatMatch(filter);
        if (nodeList == null || nodeList.size() == 0) {
            return null;
        }
        BulletList ul = findProductList(nodeList.elementAt(0));
        if (ul == null || ul.getChildren() == null) {
            logger.warn("畅销榜页面结构异常:未找到商品列表!");
            return null;
        }

        List<BestSellerDto> sellers = new ArrayList<BestSellerDto>();
        NodeList children = ul.getChildren();
        for (int i = 0; i < children.size(); i++) {
            Node child = children.elementAt(i);
            if (!(child instanceof Bullet)) {
                continue;
            }
            BestSellerDto bestSeller = parseProductItem((Bullet) child);
            if (bestSeller != null) {
                logger.info("畅销单品:" + bestSeller);
                sellers.add(bestSeller);
            }
        }
        return sellers;
    }

    /**
     * Locates the list inside the container: the child at index 1 as before,
     * otherwise the first list child.
     */
    private BulletList findProductList(Node container) {
        NodeList children = container.getChildren();
        if (children == null) {
            return null;
        }
        if (children.size() > 1 && children.elementAt(1) instanceof BulletList) {
            return (BulletList) children.elementAt(1);
        }
        for (int i = 0; i < children.size(); i++) {
            if (children.elementAt(i) instanceof BulletList) {
                return (BulletList) children.elementAt(i);
            }
        }
        return null;
    }

    /**
     * Converts one list item into a DTO.
     *
     * @return the DTO, or {@code null} when the item has no name link; an item
     *         without a price element keeps a {@code null} price
     */
    private BestSellerDto parseProductItem(Bullet li) throws ParserException {
        Object introNode = findTagByClassName(li, "pro_intro");
        if (!(introNode instanceof Span)) {
            logger.warn("畅销单品缺少名称节点,已跳过");
            return null;
        }
        Span nameDiv = (Span) introNode;
        // Find the tag whose tag name is LinkTag.
        Object linkNode = findTagByName(nameDiv, "LinkTag");
        if (!(linkNode instanceof LinkTag)) {
            logger.warn("畅销单品缺少链接节点,已跳过");
            return null;
        }
        LinkTag link = (LinkTag) linkNode;

        BestSellerDto bestSeller = new BestSellerDto();
        bestSeller.setName(link.getLinkText());
        bestSeller.setUrl(SITE_ROOT + link.getLink());

        Object priceNode = findTagByClassName(li, "pro_price");
        if (priceNode instanceof Span) {
            // NOTE(review): the "&yen;" entity form is stripped too because the raw
            // HTML may carry it instead of the character - confirm against a live page.
            String price = ((Span) priceNode).getChildrenHTML()
                    .replace("<em>", "")
                    .replace("</em>", "")
                    .replace("&yen;", "")
                    .replace("¥", "");
            bestSeller.setPrice(price.trim());
        }
        return bestSeller;
    }

    /**
     * Builds the paging URL pattern (everything up to and including the page
     * parameter) from the script variables {@code jumpUrl} and {@code dfy} found in
     * the page's paging script. When several scripts qualify, the last one wins.
     *
     * @return the pattern; when the variables cannot be found only the sort query is
     *         returned, so the URLs built from it will not resolve (a warning is logged)
     */
    @Override
    public String extractNextPageUrlPattern(String bestSellerHtml) throws ParserException {
        String nextPageUrl = "";
        Parser parser = Parser.createParser(bestSellerHtml, "utf-8");
        NodeFilter filter = new HasAttributeFilter("type", "text/javascript");
        NodeList children = parser.extractAllNodesThatMatch(filter);
        if (children == null || children.size() == 0) {
            logger.debug("没有值");
        } else {
            logger.debug("有值");
        }

        int count = (children == null) ? 0 : children.size();
        for (int i = 0; i < count; i++) {
            if (!(children.elementAt(i) instanceof ScriptTag)) {
                continue;
            }
            ScriptTag child = (ScriptTag) children.elementAt(i);
            // The paging script is the one whose first child mentions the page parameter.
            if (child.findPositionOf(PAGE_PARAM) != 0) {
                continue;
            }
            String script = child.getChildrenHTML();
            String jumpUrl = firstMatch(JUMP_URL_PATTERN, script);
            String dfy = firstMatch(DFY_PATTERN, script);
            if (jumpUrl == null || dfy == null) {
                logger.warn("未能从分页脚本中解析出 jumpUrl/dfy");
            }
            // A missing part counts as empty instead of the literal text "null".
            nextPageUrl = (jumpUrl == null ? "" : jumpUrl) + (dfy == null ? "" : dfy);
        }
        if (nextPageUrl.length() == 0) {
            logger.warn("未找到畅销榜分页脚本,下一页URL模式不完整!");
        }

        nextPageUrl += SORT_QUERY;
        logger.info("畅销榜下一页URL模式:" + nextPageUrl);
        return nextPageUrl;
    }

    /** Returns the trimmed first match of {@code pattern} in {@code text}, or {@code null}. */
    private static String firstMatch(Pattern pattern, String text) {
        if (text == null) {
            return null;
        }
        Matcher matcher = pattern.matcher(text);
        return matcher.find() ? matcher.group(0).trim() : null;
    }

    /**
     * Builds the URL of a list page.
     *
     * @param nextPageUrlPattern pattern produced by {@link #extractNextPageUrlPattern(String)}
     * @param pageNum            1-based page number; the site counts pages from 0
     */
    @Override
    public String getNextPageUrl(String nextPageUrlPattern, int pageNum) {
        return SERVLET_BASE + nextPageUrlPattern + (pageNum - 1);
    }

    /** Not implemented for this site; always returns {@code null}. */
    @Override
    public String extractName(Map<String, String> params) throws ParserException {
        return null;
    }

    /** Not implemented for this site; always returns {@code null}. */
    @Override
    public String extractPrice(Map<String, String> params) throws ParserException {
        return null;
    }

    /** Not implemented for this site; always returns {@code null}. */
    @Override
    public String extractBrand(Map<String, String> params) throws ParserException {
        return null;
    }

    /** Not implemented for this site; always returns {@code null}. */
    @Override
    public String extractImageUrl(Map<String, String> params) throws ParserException {
        return null;
    }

    /** Not implemented for this site; always returns {@code false}. */
    @Override
    public boolean hasProduct(Map<String, String> params) throws ParserException {
        return false;
    }
}
以上代码用于抓取各大网站的分类目录,并解析各目录下的商品,最终得到一个商品列表(List)。
如有疑问,请联系 QQ:526151410。