// First-level page: load it, then collect either product links (when the page
// has no category list) or category links for the second crawl level.
// A failed fetch is logged; mainPage then keeps whatever value it had before.
try
{
    mainPage = webClient.getPage(url);
} catch (Exception e)
{
    log.error(e.getMessage(), e);
}
if (mainPage != null)
{
    // When the page carries a pagination container, the "/all-1" variant of
    // the URL is loaded instead (presumably the unpaginated listing - confirm).
    // If that reload fails, the page already loaded above is still used.
    if (mainPage.getElementById("paginationContainer") != null)
    {
        url = url + "/all-1";
        try
        {
            mainPage = webClient.getPage(url);
        } catch (Exception e)
        {
            log.error(e.getMessage(), e);
        }
    }
    HtmlElement categories = mainPage.getElementById("categories");
    if (categories == null)
    {
        // No category list: harvest product links straight into the third level.
        HtmlElement products = mainPage.getElementById("products");
        if (products != null)
        {
            List<HtmlElement> productDivList = products.getElementsByTagName("div");
            if (productDivList != null)
            {
                for (HtmlElement proDiv : productDivList)
                {
                    List<HtmlElement> subList = proDiv.getElementsByTagName("div");
                    if (subList == null)
                    {
                        continue;
                    }
                    for (HtmlElement dt : subList)
                    {
                        if (!"productImg".equals(dt.getAttribute("class")))
                        {
                            continue;
                        }
                        // proDiv contains a "productImg" div: record the first
                        // link found under proDiv, then stop scanning its children.
                        List<HtmlElement> subAList = proDiv.getElementsByTagName("a");
                        if (subAList != null && !subAList.isEmpty())
                        {
                            thirdPageUrls.add(subAList.get(0).getAttribute("href"));
                        }
                        break;
                    }
                }
            }
        }
    }
    else
    {
        // Category list present: record the first link of every nested div as
        // a second-level URL.
        List<HtmlElement> divList = categories.getElementsByTagName("div");
        if (divList != null)
        {
            for (HtmlElement div : divList)
            {
                List<HtmlElement> tempDivs = div.getElementsByTagName("div");
                if (tempDivs == null)
                {
                    continue;
                }
                for (HtmlElement div1 : tempDivs)
                {
                    List<HtmlElement> aList = div1.getElementsByTagName("a");
                    if (aList == null || aList.isEmpty())
                    {
                        // A div without any anchor has nothing to contribute;
                        // calling get(0) on it would throw.
                        continue;
                    }
                    HtmlAnchor a = (HtmlAnchor) aList.get(0);
                    secondPageUrls.add(a.getHrefAttribute());
                }
            }
        }
    }
}
}
}
// Second crawl level: visit every URL collected in secondPageUrls and record
// the product links found on each page in thirdPageUrls.
log.error("第二层抓取结束..........");
log.error("目前抓取到的第二层URL个数为:"+secondPageUrls.size());
int count=0;
for (String url : secondPageUrls)
{
    count++;
    log.error("正在抓取第二层的第"+count+"个URL:"+url);
    HtmlPage mainPage = null;
    try
    {
        mainPage = webClient.getPage(url);
    } catch (Exception e)
    {
        log.error(e.getMessage(), e);
    }
    log.error("抓取URL完成:"+url+",正在分析URL"+url+"+结果的URL");
    if (mainPage == null)
    {
        // Fetch failed (already logged); nothing to analyse for this URL.
        continue;
    }
    if (mainPage.getElementById("paginationContainer") != null)
    {
        // The listing is paginated: actually load the "/all-1" variant, as the
        // first-level handling does. Previously the URL was rewritten but the
        // page was never reloaded, so the rewritten URL had no effect.
        // If the reload fails, the page loaded above is still used.
        url = url + "/all-1";
        try
        {
            mainPage = webClient.getPage(url);
        } catch (Exception e)
        {
            log.error(e.getMessage(), e);
        }
    }
    HtmlElement products = mainPage.getElementById("products");
    if (products == null)
    {
        continue;
    }
    List<HtmlElement> list = products.getElementsByTagName("div");
    if (list == null || list.isEmpty())
    {
        continue;
    }
    for (HtmlElement h : list)
    {
        // Only divs whose class is exactly "productName" carry the product link.
        if (!"productName".equals(h.getAttribute("class")))
        {
            continue;
        }
        List<HtmlElement> links = h.getElementsByTagName("a");
        if (links != null && !links.isEmpty())
        {
            HtmlAnchor htmlAnchor = (HtmlAnchor) links.get(0);
            String linkStr = htmlAnchor.getHrefAttribute();
            thirdPageUrls.add(linkStr);
            log.error(linkStr);
        }
    }
}
log.error("第二层抓取结束..........");
// The second-level URLs are no longer needed; drop them and reset the counter
// so it can be reused for the third level.
secondPageUrls.clear();
secondPageUrls = null;
count=0;
log.error("目前抓取到的第三层URL个数为:"+thirdPageUrls.size());
// Site prefix that is searched for in each third-level URL further down, when
// the local directory name is derived from the URL.
String urlPrix="http://www.swarovski-crystallized.com/jewelry/us/";
// Third crawl level: visit every URL collected in thirdPageUrls.
for (String url : thirdPageUrls)
{
// count was reset to 0 just before this loop, so here it is the 1-based
// position of the current URL.
count++;
log.error("正在抓取第三层的第"+count+"个URL:"+url);
HtmlPage mainPage = null;
try
{
mainPage = webClient.getPage(url);
} catch (Exception e)
{
// A failed fetch is only logged; mainPage stays null and the block guarded
// by the null check below is not entered for this URL.
log.error(e.getMessage(), e);
}
if (mainPage != null)
{
log.error("抓取URL完成:"+url+",正在分析URL"+url+"+结果");
// Derive the local output directory from the part of the URL that lies
// between the site prefix and the query string.
int indexC=url.indexOf(urlPrix);
int indexD=url.indexOf("?");
if (indexD < 0)
{
    // No query string: the path runs to the end of the URL. Without this,
    // substring() below would be called with -1 and throw.
    indexD = url.length();
}
if (indexC < 0 || indexC + urlPrix.length() > indexD)
{
    // The URL does not start with the expected site prefix, so no sensible
    // directory name can be derived from it; skip this URL.
    log.error("Cannot derive local directory, unexpected URL: " + url);
    continue;
}
String dirStr=url.substring(indexC+urlPrix.length(), indexD);
String regEx = "/";
// Turn the URL path separators into Windows path separators via the
// replece helper (defined elsewhere in this file).
dirStr=replece( regEx,"\\\\",dirStr);
// Create the folder that will hold the scraped data.
dirStr = "D:\\swaroski\\"+dirStr;
File file = new File(dirStr);
if(file.isDirectory())
{
    // The directory already exists: use a sub-directory named after the
    // current URL counter instead.
    dirStr=dirStr+"\\"+count;
    file = new File(dirStr);
}
if (!file.mkdirs() && !file.isDirectory())
{
    // mkdirs() returns false both on failure and when the directory already
    // exists; only the former is worth reporting.
    log.error("Could not create directory: " + dirStr);
}
// Start a Product record carrying the local directory and the page URL.
Product product=new Product();
product.setLocalDir(dirStr);
product.setPageUrl(url);
// NOTE(review): "rightCol" presumably is the element holding the product
// details - confirm against the site markup.
HtmlElement rightCol = mainPage.getElementById("rightCol");
// Values to be extracted from the page; the code that fills them is not
// visible in this listing.
String title = null;
String description = null;
String packingUnit = null;
if (rightCol != null)
{
HtmlElement headlineDiv = rightCol
.getElementById("headline");
if (headlineDiv != null)
。。。。。。。。。。。。。。。。。。。。。。。。
// Persist the parsed product, then queue a download task for each of its
// resource URLs.
log.error("完成:"+url+",分析结果");
try
{
    swaroSkiDAO.addProduct(product);
} catch (Exception e)
{
    // A failed save is logged; the downloads below are still queued.
    log.error(e.getMessage(), e);
}
log.error("完成保存结果");
for (String downloadUrl : resourceUrlList)
{
    // The file name is whatever follows the last '/' of the resource URL.
    String fileName = downloadUrl.substring(downloadUrl.lastIndexOf("/") + 1);
    String targetPath = dirStr + "\\" + fileName;
    // Skip resources that are already on disk. The check must use the full
    // target path: the bare file name would be resolved against the working
    // directory, not the product directory the task writes to.
    if (new File(targetPath).exists())
    {
        continue;
    }
    pools.submit(new SaveFileThread(targetPath, downloadUrl, sem));
    log.error("开始提交下载文件:"+downloadUrl);
    try
    {
        // Pause between submissions.
        Thread.sleep(2500);
    } catch (InterruptedException e)
    {
        // Keep the interrupt visible to callers and stop queueing further
        // downloads for this product instead of silently ignoring it.
        Thread.currentThread().interrupt();
        break;
    }
}
// Worker task that downloads the image and animation files.
class SaveFileThread implements Runnable
{
private String fileName;
private String downloadUrl;
private Semaphore sem;
public SaveFileThread(String fileName,String downloadUrl,Semaphore sem)
{
this.fileName=fileName;
this.downloadUrl=downloadUrl;
this.sem=sem;
}
public void run()
{
HttpClient client = new HttpClient();
GetMethod get = new GetMethod(downloadUrl);
FileOutputStream output=null;
try
{
client.executeMethod(get);
File storeFile = new File(fileName);
output = new FileOutputStream(storeFile);
output.write(get.getResponseBody());
output.flush();
} catch (Exception e)
{
log.error(e.getMessage(), e);
}