畅销品抓取-商品列表
<#include "/WEB-INF/templates/common/include/css.ftl">
<#include "/WEB-INF/templates/common/include/js.ftl">
<#include "/WEB-INF/templates/head.ftl">
<#include "/WEB-INF/templates/left.ftl">
您的位置:畅销品抓取-商品列表
请设置待匹配一号店商品:
<@s.form id="bestsellerForm" name="productForm" action="${base}/addCategoryMatch.action" method="post">
网站名称:
抓取数量:
*
一级分类:
二级分类:
三级分类:
<#if bestsellerList ??>
<#if siteCategory ??>
目标网站:${siteCategory.siteName}
分类名称:${siteCategory.categoryName}
抓取商品数目:${siteCategory.fetchSize}
实际商品数目:${size}
</#if>
畅销排名
商品名称
商品价格
商品URL
<#list bestsellerList as crawledProduct>
${crawledProduct_index+1}
${crawledProduct.name}
<#if crawledProduct.imgPrice ??>
(${crawledProduct.price!''})
<#elseif crawledProduct.price?exists>
${crawledProduct.price!''}
<#else>
缺货
</#if>
查看商品
</#list>
</#if>
</@s.form>
/*
 * Browser-side script for the best-seller page, as pasted into the article.
 * Functions visible in this listing:
 *   createXmlHttpRequest  - returns new ActiveXObject("Microsoft.XMLHTTP") when window.ActiveXObject
 *                           exists, otherwise a new XMLHttpRequest.
 *   getCategory           - hides the level2 / level3 / productName elements depending on which of
 *                           firstValue / secondValue is "", then GETs category_getCategoryAjax.action
 *                           with `result` installed as the readystatechange handler.
 *   getProduct,
 *   getMatchProduct       - GET getProduct.action / matchProduct_getProductByCategory.action for a
 *                           category id, falling back to the value of the "second" element when the
 *                           id is "".
 *   uniencode             - escape()s the text, turns "+" into %2B and rewrites %XX codes >= 0x80
 *                           as %u00XX.
 *   getCrawledProduct     - GETs crawlProductByKeyword.action; the keyword is concatenated into the
 *                           URL without any encoding.
 *   InitResult,
 *   SecondResult,
 *   ThirdResult           - readystatechange handlers that eval() the response text and pass array[0]
 *                           to addOption for the 'first' / 'second' / 'third' select.
 *   addOption, addRow,
 *   listMatchProduct      - create OPTION elements / table rows from the response data.
 * Every request is stored in the implicit global `xmlHttpRequest`, which the handlers read back.
 * NOTE(review): the listing is collapsed onto a few physical lines, so each `//` comment swallows
 * the code that follows it on the same line, and several string literals and loop headers (in
 * addOption, the row-building code and listMatchProduct) look truncated - presumably embedded HTML
 * was stripped when the article was extracted. Restore from the original source before running.
 */
//document.charset="UTF-8"; function createXmlHttpRequest() { if(window.ActiveXObject) { return new ActiveXObject("Microsoft.XMLHTTP"); } else if (window.XMLHttpRequest) // For general cases. { return new XMLHttpRequest(); } } function getCategory(process,result,firstValue,secondValue) { // 隐藏标签 var level2=document.getElementById("level2"); var level3=document.getElementById("level3"); var product=document.getElementById("productName"); document.getElementById('third').options.length=1; if(undefined != document.getElementById('product')) { document.getElementById('product').options.length=1; } if(firstValue=="") { level2.style.display="none"; level3.style.display="none"; if(product != undefined) { product.style.display="none"; } } else if(secondValue=="") { level3.style.display="none"; if(product != undefined) { product.style.display="none"; } } else if(level3=="") { if(product != undefined) { product.style.display="none"; } } var url="${base}/category_getCategoryAjax.action?process="+process+"&firstId="+firstValue+"&secondId="+secondValue; xmlHttpRequest=createXmlHttpRequest(); xmlHttpRequest.onreadystatechange=result; xmlHttpRequest.open("GET",url,true); xmlHttpRequest.send(null); } function getProduct(categoryId,processMethod) { if(categoryId=="") { categoryId = document.getElementById("second").value; } var url="${base}/getProduct.action?categoryId="+categoryId; xmlHttpRequest=createXmlHttpRequest(); xmlHttpRequest.onreadystatechange=processMethod; xmlHttpRequest.open("GET",url,true); xmlHttpRequest.send(null); } function getMatchProduct(categoryId,processMethod) { document.getElementById('product').selectedIndex=0; if(categoryId=="") { categoryId = document.getElementById("second").value; } var url="${base}/matchProduct_getProductByCategory.action?matchProductDto.categoryId="+categoryId; xmlHttpRequest=createXmlHttpRequest(); xmlHttpRequest.onreadystatechange=processMethod; xmlHttpRequest.open("GET",url,true); xmlHttpRequest.send(null); } function 
uniencode(text) { text = escape(text.toString()).replace(/\+/g, "%2B"); var matches = text.match(/(%([0-9A-F]{2}))/gi); if (matches) { for (var matchid = 0; matchid < matches.length; matchid++) { var code = matches[matchid].substring(1,3); if (parseInt(code, 16) >= 128) { text = text.replace(matches[matchid], '%u00' + code); } } } text = text.replace('%25', '%u0025'); return text; } function getCrawledProduct(productId,siteId,keyword,processMethod) { // var keyword = escape(keyword); // alert(keyword); // keyword.replace('%','\\'); // keyword="\u98DE\u5229\u6D66\u5243\u987B\u5200"; // uniencode(keyword); // keyword = uniencode(keyword); // keyword = encodeURIComponent(keyword); // alert(keyword); var url="${base}/crawlProductByKeyword.action?productId="+productId+"&siteId="+siteId+"&keyword="+keyword; // url=encodeURIComponent(url); // url=encodeURI(url); // alert(url); xmlHttpRequest=createXmlHttpRequest(); xmlHttpRequest.onreadystatechange=processMethod; // xmlHttpRequest.open("POST",url,true); xmlHttpRequest.open("GET", url, true); xmlHttpRequest.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded'); xmlHttpRequest.setRequestHeader("Content-Type","text/html"); xmlHttpRequest.setRequestHeader("Content-Type","utf-8"); xmlHttpRequest.send(null); } // 页面初始化时,加载一级分类,屏蔽二三级分类和产品 function InitResult() { if(xmlHttpRequest.readyState==4 && xmlHttpRequest.status==200) { var array = eval(xmlHttpRequest.responseText); addOption(array[0],'first','\u9009\u62e9\u76ee\u5f55'); // addOption(array[1],'second','\u5168\u90E8'); // addOption(array[2],'third','\u5168\u90E8'); } // 隐藏标签 var level2=document.getElementById("level2"); var level3=document.getElementById("level3"); var product=document.getElementById("productName"); level2.style.display="none"; level3.style.display="none"; if(product != undefined) { product.style.display="none"; } } function SecondResult() { if(xmlHttpRequest.readyState==4 && xmlHttpRequest.status==200) { var array = 
eval(xmlHttpRequest.responseText); addOption(array[0],'second','\u5168\u90E8'); // addOption(array[1],'third','\u5168\u90E8'); // 控制标签隐藏,显示 var level2=document.getElementById("level2"); level2.style.display="block"; } } function ThirdResult() { if(xmlHttpRequest.readyState==4 && xmlHttpRequest.status==200) { var array = eval(xmlHttpRequest.responseText); addOption(array[0],'third','\u5168\u90E8'); // 控制标签隐藏,显示 var level3=document.getElementById("level3"); level3.style.display="block"; } } function addOption(array,id,head) { var first = document.getElementById(id); var othershowstyle = document.getElementById('level31') ; var normalshowstyle = document.getElementById('level3') ; first.options.length=0; if(array.length == 0){ othershowstyle.style.display="none"; normalshowstyle.style.display="block"; if(head!='') { var newOption = document.createElement("OPTION"); newOption.text=head; newOption.value=''; first.options.add(newOption); } }else if(array.length>0){ //alert('array[0].siteId'+array[0].siteId+'id' +id); //线上id为10 if(array[0].siteId == '9' && id == 'third'){ //alert('成功进入!'); othershowstyle.style.display="block"; normalshowstyle.style.display="none"; var htmlstring=''; for(i=0;i'+trim(array[i].categoryName); htmlstring = htmlstring +'
'; } //alert(htmlstring); othershowstyle.innerHTML=htmlstring; }else{ othershowstyle.style.display="none"; normalshowstyle.style.display="block"; if(head!='') { var newOption = document.createElement("OPTION"); newOption.text=head; newOption.value=''; first.options.add(newOption); } for(i=0;i"; // 不用文本框下面取不出来。 (因遍历的方式需要不带文本框) tr.appendChild(td0); var td1=document.createElement("td"); tdname="第"+i+"行 "+tdname; td1.innerHTML=""; tr.appendChild(td1); var td2=document.createElement("td"); td2.innerHTML=""; tr.appendChild(td2); var td3=document.createElement("td"); // td3.id=tr.id; td3.innerHTML=""; tr.appendChild(td3); } // 控制标签隐藏,显示 var product=document.getElementById("productName"); product.style.display="block"; } } function addRow(i) { // i为行的id // 添加行 var table=document.getElementById("productTable"); var length=table.rows.length; var tr=document.createElement("tr"); tr.id=++i; var td0=document.createElement("td"); td0.innerHTML=""; // 不用文本框下面取不出来。 (因遍历的方式需要不带文本框) tr.appendChild(td0); var td1=document.createElement("td"); tdname="第"+i+"行 "+tdname; td1.innerHTML=""; tr.appendChild(td1); var td2=document.createElement("td"); td2.innerHTML=""; tr.appendChild(td2); var td3=document.createElement("td"); // td3.id=tr.id; td3.innerHTML=""; tr.appendChild(td3); addData(i); // 含数据验证 if(flag==true) { document.getElementById("newbody").appendChild (tr); } else { i--; } } function listMatchProduct() { if(xmlHttpRequest.readyState==4 && xmlHttpRequest.status==200) { var first = document.getElementById("product"); first.options.length=0; var array = eval(xmlHttpRequest.responseText); var newOption = document.createElement("OPTION"); newOption.text='\u9009\u62e9\u76ee\u5f55'; newOption.value=''; first.options.add(newOption); for(i=0;i 3、如果是得到下拉列表的一个链接直接根据ID号从数据库读取,然后再进行网页的抓取,如果需求是将多个子URL的数据显示在一个页面上,那么就附带java代码
public String crawlBestseller() { int id = 0; int amount = 0;//siteCategory.getFetchSize(); int index = 1; String CateName = ""; int fetchSize = 0 ; String[] idStrings = null; //针对苏宁易购需求所做的修改 if (siteCategory.getIds().indexOf(",")>0) {//说明是苏宁的多选子类 idStrings =siteCategory.getIds().split(","); index = idStrings.length; } for (int i = 0; i < index; i=i+2) {//一般情况下是执行一次,苏宁易购有执行多次的可能 List
transferList ; if (index>1) { id=Integer.parseInt(idStrings[i]); amount = Integer.parseInt(idStrings[i+1]); }else{ id=Integer.parseInt(siteCategory.getIds()); amount=siteCategory.getFetchSize(); } // 根据id获取分类信息 siteCategory = siteCategorySvc.getSiteCategoryById(id); fetchSize+=amount; if (CateName == "") { CateName = siteCategory.getCategoryName(); }else { CateName = CateName + "-"+siteCategory.getCategoryName(); } // 追加网站名称 // 将网站信息放入内存map List siteList = (List )ActionContext.getContext().getSession().get("siteList"); Map siteMap = new HashMap (); for(SiteDto site : siteList) { siteMap.put(site.getSiteId(), site); } String siteName = siteMap.get(siteCategory.getSiteId()).getSiteName(); siteCategory.setSiteName(siteName); ActionContext.getContext().getSession().put("params", siteCategory); // 获取畅销商品列表 transferList = bestsellerSvc.fetchBestSeller(id, amount); for(BestSellerDto bestseller : transferList) { String price = bestseller.getPrice(); if( price != null && price.contains("http")) { // 解析图片价格 bestseller.setImgPrice(price); BigDecimal decimal = ImgUtils.getDecimalFromImg(price); if(decimal != null) { bestseller.setPrice(decimal.toString()); } } } if (i==0) { bestsellerList = transferList; }else { bestsellerList.addAll(transferList); } } siteCategory.setCategoryName(CateName); siteCategory.setFetchSize(fetchSize); ActionContext.getContext().getSession().put("topProducts", bestsellerList); ServletActionContext.getContext().put("size", bestsellerList.size()); return SUCCESS; } public List
fetchBestSeller(Integer id, int amount) { List bestSellers = new ArrayList (); if (amount <= 0) { logger.warn("畅销品数目不能为负数!"); return null; } SiteCategoryDto siteCategory = getSiteCategoryById(id); // 抓取参数 Map params; // 组织抓取参数与页面参数 params = pageParamItemDao.getPageConfigBySite(siteCategory.getSiteId()); params.putAll(crawlerParamItemDao.getCrawlConfigBySite(siteCategory. getSiteId())); // 构建抓取对象 String charset = params.get(PageParamNames.CONTENT_ENCODING); //int pageSize = Integer.parseInt(params.get(PageParamNames.BS_PAGE_SIZE)); int pageSize = 20; int pages = amount / pageSize + 1; logger.info("畅销榜页数:" + pages); Crawler crawler = new Crawler(charset); String bestSellerHtml = null; PageParser pageParser = null; try { String categoryUrl = siteCategory.getCategoryUrl(); if (categoryUrl == null || categoryUrl.equals("")) { logger.info("此分类不支持畅销榜!"); return null; } bestSellerHtml = crawler.crawl(siteCategory.getCategoryUrl()); pageParser = PageParserFactory.createPageParser(null, charset, params); String nextPageUrlPattern=pageParser.extractNextPageUrlPattern(bestSellerHtml); if (siteCategory.getCategoryUrl().indexOf("suning")>0) { String nextPageUrl0 = pageParser.getNextPageUrl( nextPageUrlPattern, 1); bestSellerHtml = crawler.crawl(nextPageUrl0); } bestSellers = pageParser.extractBestSeller(bestSellerHtml); //if(amount<=bestSellers.size()) return bestSellers; if (nextPageUrlPattern != null) { for (int pageNum = 2; pageNum <= pages; pageNum++) { logger.info("抓取畅销榜第 " + pageNum + " 页"); String nextPageUrl = pageParser.getNextPageUrl( nextPageUrlPattern, pageNum); bestSellerHtml = crawler.crawl(nextPageUrl); List moreBestSellers = pageParser. extractBestSeller(bestSellerHtml); if (moreBestSellers == null || moreBestSellers.isEmpty()) { break; } if (bestSellers.get(bestSellers.size() - 1).getUrl().equals( moreBestSellers.get(moreBestSellers.size() - 1). 
getUrl())) { break; } if (moreBestSellers != null && !moreBestSellers.isEmpty()) { bestSellers.addAll(moreBestSellers); } } } } catch (IOException e) { logger.error("抓取 " + siteCategory.getCategoryName() + " 畅销榜时出现异常!" + "URL为:" + siteCategory.getCategoryUrl(), e); } catch (ParserException e) { logger.error("解析畅销榜页面时出现异常!" + "URL为:" + siteCategory.getCategoryUrl(), e); } logger.info("++++++++++++++++++++++++++++++++++"); logger.info("畅销集合大小:" + (bestSellers == null ? 0 : bestSellers.size())); if (bestSellers != null) { if (bestSellers.size() <= amount) { return bestSellers; } List subList = bestSellers.subList(0, amount); logger.info("----------------------------------"); logger.info("畅销集合大小:" + subList.size()); return subList; } else { return new ArrayList (); } } 下面是解析器
package com.yihaodian.pis.crawler; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.tags.Bullet; import org.htmlparser.tags.BulletList; import org.htmlparser.tags.Div; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.ScriptTag; import org.htmlparser.tags.Span; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import com.yihaodian.pis.dto.BestSellerDto; public class SuningPageParser extends PageParser{ private static final Logger logger = Logger.getLogger(SuningPageParser.class); public SuningPageParser(String html, String charset) { super(html, charset); // TODO Auto-generated constructor stub } @Override public List
extractBestSeller(String bestSellerHtml) throws ParserException { List sellers = new ArrayList (); Parser parser = Parser.createParser(bestSellerHtml, charset); NodeFilter filter = new HasAttributeFilter("class", "product_list02 profix02 clearfix"); NodeList nodeList = parser.extractAllNodesThatMatch(filter); if (nodeList == null || nodeList.size() == 0) { return null; } Node div = nodeList.elementAt(0); NodeList divchildren = div.getChildren(); BulletList ul = (BulletList)divchildren.elementAt(1); NodeList children = ul.getChildren(); BestSellerDto bestSeller = null; for (int i = 0; i < children.size(); i++) { bestSeller = new BestSellerDto(); Node child = children.elementAt(i); if (child instanceof Bullet) { Bullet li = (Bullet) child; Span nameDiv = (Span) findTagByClassName(li, "pro_intro"); //寻找tagName是 LinkTag 的那个 LinkTag link = (LinkTag)findTagByName(nameDiv,"LinkTag"); bestSeller.setName(link.getLinkText()); bestSeller.setUrl("http://www.suning.cn"+link.getLink()); Span pricespan = (Span) findTagByClassName(li, "pro_price"); String priceteString =pricespan.getChildrenHTML().replace("", "").replace("", "").replace("¥", ""); bestSeller.setPrice(priceteString); logger.info("畅销单品:" + bestSeller); sellers.add(bestSeller); } else { continue; } } return sellers; } @Override public String extractNextPageUrlPattern(String bestSellerHtml) throws ParserException { String nextPageUrl=""; Parser parser = Parser.createParser(bestSellerHtml, "utf-8"); NodeFilter filter = new HasAttributeFilter("type", "text/javascript");; NodeList children = parser.extractAllNodesThatMatch(filter); if (children == null || children.size() == 0) { System.out.println("没有值"); }else{ System.out.println("有值"); } for (int i = 0; i < children.size(); i++) { ScriptTag child = (ScriptTag) children.elementAt(i); if(child.findPositionOf("¤tPage=")==0){ String putInCart1 = null; String putInCart2 = null; //Pattern pattern2 = Pattern.compile("(?<=currentPage[)] \\{)([^\\}]*?)(?=\\})"); Pattern pattern2 
= Pattern.compile("(?<=var[ \\s]{0,100}(jumpUrl)[\\s]{0,100}[=][\\s]{0,100}[\"])(.*?)(?=\"\\s{0,100}[+])"); Pattern pattern1 = Pattern.compile("(?<=var[ \\s]{0,100}dfy\\s{0,100}=\\s{0,100}[\"])(.*?)(?=[\"][\\s]{0,100})"); Matcher matcher1 = pattern1.matcher(child.getChildrenHTML()); if (matcher1.find()) { putInCart1 = matcher1.group(0).trim(); } Matcher matcher2 = pattern2.matcher(child.getChildrenHTML()); if (matcher2.find()) { putInCart2 = matcher2.group(0).trim(); } //System.out.println(putInCart2.substring(15, putInCart2.indexOf(" + dfy")-1)); nextPageUrl=putInCart2+putInCart1; } } // parser = Parser.createParser(bestSellerHtml, "utf-8"); //得到当前页currentPage // String currentPage=""; // filter = new HasAttributeFilter("class", "on"); // children = parser.extractAllNodesThatMatch(filter); //LinkTag dLinkTag = (LinkTag) children.elementAt(0); //System.out.println(dLinkTag.getLinkText()); // for (int i = 0; i < children.size(); i++) { // Node node =children.elementAt(i); // if (node.getChildren().size()<2) { // LinkTag dLinkTag = (LinkTag)node; // if(dLinkTag.getLink().equals("#")) // currentPage= dLinkTag.getLinkText(); // } // } nextPageUrl+="&ip_sortBy=salevolumn0&sortType=4¤tPage=";// + currentPage; logger.info("畅销榜下一页URL模式:" + nextPageUrl); return nextPageUrl; } @Override public String getNextPageUrl(String nextPageUrlPattern, int pageNum) { StringBuilder sb = new StringBuilder(); sb.append("http://www.suning.cn/webapp/wcs/stores/servlet/"); sb.append(nextPageUrlPattern+(pageNum-1)); String nextPageUrl = sb.toString(); return nextPageUrl; } @Override public String extractName(Map params) throws ParserException { // TODO Auto-generated method stub return null; } @Override public String extractPrice(Map params) throws ParserException { // TODO Auto-generated method stub return null; } @Override public String extractBrand(Map params) throws ParserException { // TODO Auto-generated method stub return null; } @Override public String extractImageUrl(Map params) 
throws ParserException { // TODO Auto-generated method stub return null; } @Override public boolean hasProduct(Map params) throws ParserException { // TODO Auto-generated method stub return false; } } 此是用来抓取各大网站的目录然后对目录下的商品进行解析得到一个list。
不懂的联系QQ526151410