递归读取 Heritrix 爬取下来的目录文件
1. 在 StringUtil 中写一个根据路径递归获取所有文件路径的方法
public ArrayList<String> getAllPath(Stringpath){
File file = newFile(path);
File[] fileList = file.listFiles();
for(File f :fileList){
if(f.isDirectory()){
getAllPath(f.getAbsolutePath());
}else{
results.add(f.getAbsolutePath());
}
}
return results;
}
2. 测试:
@Test
public voidgetAllPathTest(){
String path = "F:\\lucene\\myheritrix\\jobs\\output\\mirror\\news.sohu.com";
ArrayList<String> lists = newStringUtil().getAllPath(path);
for(String s: lists){
System.out.println(s);
}
System.out.println(lists.size());
}
3. 整合起来:遍历所有文件路径,逐个读取并解析内容,再写入输出目录
public voidpaserSohuNews2(String inputPath,String outputPath){
File outputfile = newFile(outputPath);
try {
if(!outputfile.exists()){
outputfile.mkdirs();
}
ArrayList<String> pathLists = newStringUtil().getAllPath(inputPath);
for(Stringpath: pathLists){
if(!path.contains(".shtml")) continue;
String html = StringUtil.getContent(path);
//System.out.println(html);
String content = StringUtil.getContentUseParse2(html,"gbk", "h1","itemprop","headline");
content += ConstantString.WIN_NextLine+StringUtil.getContentUseParse2(html,"gbk", "div", "itemprop", "datePublished");
System.out.println(content);
content += ConstantString.WIN_NextLine+StringUtil.getContentUseParse2(html,"gbk", "div", "itemprop", "articleBody");
String name = StringUtil.getNameFromPath(path);
File outputFile = newFile(outputPath+"/"+name+ConstantString.postText);
if(!outputFile.exists()){
outputFile.createNewFile();
}
//System.out.println("outputpath:"+outputPath);
//System.out.println("outputFile:"+outputFile);
OutputStreamWriter ot = new OutputStreamWriter(newFileOutputStream(outputFile));
ot.write(content);
ot.close();
//System.out.println("---------------------------------------");
}