利用 FileVisitor 来编写搜索工具,需要明确以下几点:
- visitFile() 是用于比较当前文件和搜索条件的最佳地方。在这里,你可以获取当前文件名、文件扩展名、文件属性或者打开文件读取文件内容。这个方法不会搜索目录。
- 如果要搜索目录,必须将比较代码放入 preVisitDirectory() 或 postVisitDirectory() 方法中,至于放入哪个方法,取决于你的需求。
- 如果某个文件由于某种原因无法访问, visitFileFailed() 应当返回 FileVisitResult.CONTINUE,因为这类问题并不需要让整个搜索过程停止。
- 如果只需要返回一个结果,那么在搜索到结果之后需要在 visitFile() 方法中返回 FileVisitResult.TERMINATE,否则要返回 FileVisitResult.CONTINUE。
- 搜索过程可以将软链接处理为目标文件,但是在递归删除的时候,建议只删除软链接自身。
通过名称搜索
下面代码演示了如何通过文件名称进行搜索。代码中将会在整个默认文件系统中搜索名为 rafa_1.jpg 的文件,并在搜索到结果后停止。
import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.FileVisitOption;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.EnumSet;

/**
 * File visitor that looks for a single file by name and terminates the walk at the first hit.
 *
 * <p>Only the last name element of each visited file is compared; directories are not compared.
 */
class Search implements FileVisitor<Path> {

    private final Path searchedFile;

    /** Becomes {@code true} once a file with the searched name has been visited. */
    public boolean found;

    /**
     * @param searchedFile the file name to look for, e.g. {@code Paths.get("rafa_1.jpg")}
     */
    public Search(Path searchedFile) {
        this.searchedFile = searchedFile;
        this.found = false;
    }

    /**
     * Compares the name of {@code file} with the searched name; on a match prints the real
     * path of the file and sets {@link #found}.
     *
     * @param file the file currently being visited
     * @throws IOException if the real path of a matching file cannot be resolved
     */
    void search(Path file) throws IOException {
        Path name = file.getFileName();
        if (name != null && name.equals(searchedFile)) {
            System.out.println("Searched file was found: " + searchedFile
                    + " in " + file.toRealPath().toString());
            found = true;
        }
    }

    @Override
    public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
        System.out.println("Visited: " + dir);
        return FileVisitResult.CONTINUE;
    }

    @Override
    public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs)
            throws IOException {
        return FileVisitResult.CONTINUE;
    }

    @Override
    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
        search(file);
        // Only one result is wanted, so stop the whole walk as soon as it is found.
        return found ? FileVisitResult.TERMINATE : FileVisitResult.CONTINUE;
    }

    @Override
    public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
        // A file that cannot be visited must not stop the search; report the error here if needed.
        return FileVisitResult.CONTINUE;
    }
}

/** Searches every root directory of the default file system for {@code rafa_1.jpg}. */
class Main {

    public static void main(String[] args) throws IOException {
        Path searchFile = Paths.get("rafa_1.jpg");
        Search walk = new Search(searchFile);
        EnumSet<FileVisitOption> opts = EnumSet.of(FileVisitOption.FOLLOW_LINKS);

        Iterable<Path> dirs = FileSystems.getDefault().getRootDirectories();
        for (Path root : dirs) {
            if (walk.found) {
                break; // no need to look at the remaining roots
            }
            Files.walkFileTree(root, opts, Integer.MAX_VALUE, walk);
        }

        if (!walk.found) {
            System.out.println("The file " + searchFile + " was not found!");
        }
    }
}
通过 glob 模式匹配搜索
有的时候,你可能只知道部分文件名,那么就可以使用 glob 模式(通配符)匹配功能。下面的代码将演示如何在 C:\rafaelnadal 目录树中查找匹配 *.jpg 的文件。整个目录树搜索完才会停止。
import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.FileVisitOption;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.EnumSet;

/**
 * File visitor that prints every visited file whose name matches a glob pattern.
 *
 * <p>The walk is never terminated early: all matches in the tree are reported.
 */
class Search implements FileVisitor<Path> {

    private final PathMatcher matcher;

    /**
     * @param glob glob pattern applied to the file name only, e.g. {@code "*.jpg"}
     */
    public Search(String glob) {
        matcher = FileSystems.getDefault().getPathMatcher("glob:" + glob);
    }

    /**
     * Prints the name and real path of {@code file} when its name matches the pattern.
     *
     * @param file the file currently being visited
     * @throws IOException if the real path of a matching file cannot be resolved
     */
    void search(Path file) throws IOException {
        Path name = file.getFileName();
        if (name != null && matcher.matches(name)) {
            System.out.println("Searched file was found: " + name
                    + " in " + file.toRealPath().toString());
        }
    }

    @Override
    public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
        System.out.println("Visited: " + dir);
        return FileVisitResult.CONTINUE;
    }

    @Override
    public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs)
            throws IOException {
        return FileVisitResult.CONTINUE;
    }

    @Override
    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
        search(file);
        return FileVisitResult.CONTINUE;
    }

    @Override
    public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
        // A file that cannot be visited must not stop the search; report the error here if needed.
        return FileVisitResult.CONTINUE;
    }
}

/** Lists all {@code *.jpg} files below {@code C:/rafaelnadal/}. */
class Main {

    public static void main(String[] args) throws IOException {
        String glob = "*.jpg";
        Path fileTree = Paths.get("C:/rafaelnadal/");
        Search walk = new Search(glob);
        EnumSet<FileVisitOption> opts = EnumSet.of(FileVisitOption.FOLLOW_LINKS);

        Files.walkFileTree(fileTree, opts, Integer.MAX_VALUE, walk);
    }
}
如果你知道文件的更多属性,那么可以编写更复杂的过滤条件。例如,除了文件名外,你可能还知道文件大小不超过多少 KB、文件的创建时间、文件的最后修改时间、文件是否只读、文件是否隐藏、文件所有者是谁等。这些都是文件属性的一部分。下面的代码将搜索文件名匹配 *.jpg 并且大小不超过 100 KB 的文件。
import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.FileVisitOption;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.EnumSet;

/**
 * File visitor that prints every visited file whose name matches a glob pattern and whose
 * size does not exceed a given number of bytes.
 */
class Search implements FileVisitor<Path> {

    private final PathMatcher matcher;
    private final long acceptedSize;

    /**
     * @param glob glob pattern applied to the file name only, e.g. {@code "*.jpg"}
     * @param accepted_size largest accepted file size in bytes (inclusive)
     */
    public Search(String glob, long accepted_size) {
        matcher = FileSystems.getDefault().getPathMatcher("glob:" + glob);
        this.acceptedSize = accepted_size;
    }

    /**
     * Checks {@code file} after reading its basic attributes from the file system.
     *
     * @param file the file to check
     * @throws IOException if the attributes or the real path cannot be read
     */
    void search(Path file) throws IOException {
        search(file, Files.readAttributes(file, BasicFileAttributes.class));
    }

    /**
     * Prints name, real path and size of {@code file} when it satisfies both conditions.
     *
     * <p>The size is taken from {@code attrs}, which the tree walk has already read, instead of
     * querying the file system again. Re-reading it for every visited file threw for entries
     * such as dangling symbolic links and thereby aborted the whole walk.
     *
     * @param file the file currently being visited
     * @param attrs the attributes supplied for {@code file}
     * @throws IOException if the real path of a matching file cannot be resolved
     */
    void search(Path file, BasicFileAttributes attrs) throws IOException {
        Path name = file.getFileName();
        if (name == null || !matcher.matches(name)) {
            return;
        }
        long size = attrs.size();
        if (size <= acceptedSize) {
            System.out.println("Searched file was found: " + name
                    + " in " + file.toRealPath().toString() + " size (bytes):" + size);
        }
    }

    @Override
    public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
        System.out.println("Visited: " + dir);
        return FileVisitResult.CONTINUE;
    }

    @Override
    public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs)
            throws IOException {
        return FileVisitResult.CONTINUE;
    }

    @Override
    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
        search(file, attrs);
        return FileVisitResult.CONTINUE;
    }

    @Override
    public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
        // A file that cannot be visited must not stop the search; report the error here if needed.
        return FileVisitResult.CONTINUE;
    }
}

/** Lists all {@code *.jpg} files of at most 100 KB below {@code C:/rafaelnadal/}. */
class Main {

    public static void main(String[] args) throws IOException {
        String glob = "*.jpg";
        long size = 102400; // 100 kilobytes in bytes
        Path fileTree = Paths.get("C:/rafaelnadal/");
        Search walk = new Search(glob, size);
        EnumSet<FileVisitOption> opts = EnumSet.of(FileVisitOption.FOLLOW_LINKS);

        Files.walkFileTree(fileTree, opts, Integer.MAX_VALUE, walk);
    }
}
通过文件内容进行搜索
文件搜索中比较高级的功能是通过文件内容来进行搜索。你传入一句话或几个单词,返回文件内容中包含这些文本的文件。这种搜索非常消耗时间,因为它需要访问每个文件(每个文件都需要打开、读取、关闭操作)。另外,很多文件格式都支持文本,例如 PDF、Microsoft Word、Excel、PowerPoint、简单文本文件、XML、HTML、XHTML 等等。这些文件类型的读取方式各不相同,它们都需要单独的代码来进行内容提取。
下面,我们将开发按文件内容搜索的应用。搜索条件是一组以逗号分隔的单词或短语,例如“Rafael Nadal,tennis,winner of Roland Garros,BNP Paribas tournament draws”。使用 StringTokenizer 类按逗号分隔,将每一项存入 ArrayList:
… String words="Rafael Nadal,tennis,winner of Roland Garros,BNP Paribas tournament draws"; ArrayListwordsarray = new ArrayList<>(); … StringTokenizer st = new StringTokenizer(words, ","); while (st.hasMoreTokens()) { wordsarray.add(st.nextToken()); }
编写 searchText() 方法,将文件中提取的文本传入,循环前面 ArrayList 依次比较文本是否匹配:
//search text private boolean searchText(String text) { boolean flag = false; for (int j = 0; j < wordsarray.size(); j++) { if ((text.toLowerCase()).contains(wordsarray.get(j).toLowerCase())) { flag = true; break; } } return flag; }
下面将用一组方法来提取不同格式的文件。为了“不重复发明轮子”,将使用第三方包。
搜索 PDF 格式
读取 PDF 内容,可以使用比较流行的 iText 和 Apache PDFBox。这两个包可以到 http://itextpdf.com/ 和 http://pdfbox.apache.org/ 下载。下面的代码基于 iText 5.1.2 和 PDFBox 1.6.0。
//使用 iText 搜索 PDF 文件 boolean searchInPDF_iText(String file) { PdfReader reader = null; boolean flag = false; try { reader = new PdfReader(file); int n = reader.getNumberOfPages(); OUTERMOST: for (int i = 1; i <= n; i++) { String str = PdfTextExtractor.getTextFromPage(reader, i); flag = searchText(str); if (flag) { break OUTERMOST; } } } catch (Exception e) { } finally { if (reader != null) { reader.close(); } return flag; } }
如果你更熟悉 PDFBox,那么可以使用下面代码:
/**
 * Searches a PDF file page by page using PDFBox.
 *
 * @param file path of the PDF file
 * @return {@code true} if the text of any page contains a search term; {@code false} if
 *     nothing matched or the file could not be read
 */
boolean searchInPDF_PDFBox(String file) {
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    // try-with-resources closes the input stream, which was previously leaked.
    try (FileInputStream in = new FileInputStream(new File(file))) {
        PDFParser parser = new PDFParser(in);
        parser.parse();
        cosDoc = parser.getDocument();
        pdDoc = new PDDocument(cosDoc);
        PDFTextStripper pdfStripper = new PDFTextStripper();

        int pages = pdDoc.getNumberOfPages();
        for (int page = 1; page <= pages; page++) {
            // Extract exactly one page per iteration; an end page of page + 1 made every
            // page be extracted twice.
            pdfStripper.setStartPage(page);
            pdfStripper.setEndPage(page);
            if (searchText(pdfStripper.getText(pdDoc))) {
                return true;
            }
        }
    } catch (Exception e) {
        // Best effort: an unreadable file counts as "no match", but is reported.
        System.err.println("Cannot search PDF with PDFBox: " + file + ": " + e);
    } finally {
        try {
            // pdDoc wraps cosDoc, so closing pdDoc is enough when it was created.
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception closeFailure) {
            System.err.println("Cannot close PDF: " + file + ": " + closeFailure);
        }
    }
    return false;
}
搜索 Microsoft Word、 Excel 和 PowerPoint
Microsoft office 可以用 Apache POI 包来解析,可访问 http://poi.apache.org/ 下载,下面的代码基于 3.7 版本。
处理 word:
/**
 * Searches the paragraphs of a Word document using Apache POI (HWPF).
 *
 * @param file path of the Word file
 * @return {@code true} if any paragraph contains a search term; {@code false} if nothing
 *     matched or the file could not be read
 */
boolean searchInWord(String file) {
    // try-with-resources closes the input stream, which was previously leaked.
    try (FileInputStream in = new FileInputStream(file)) {
        HWPFDocument doc = new HWPFDocument(new POIFSFileSystem(in));
        WordExtractor we = new WordExtractor(doc);
        for (String paragraph : we.getParagraphText()) {
            if (searchText(paragraph)) {
                return true;
            }
        }
    } catch (Exception e) {
        // Best effort: an unreadable file counts as "no match", but is reported.
        System.err.println("Cannot search Word file: " + file + ": " + e);
    }
    return false;
}
处理 excel:
boolean searchInExcel(String file) { Row row; Cell cell; String text; boolean flag = false; InputStream xls = null; try { xls = new FileInputStream(file); HSSFWorkbook wb = new HSSFWorkbook(xls); int sheets = wb.getNumberOfSheets(); OUTERMOST: for (int i = 0; i < sheets; i++) { HSSFSheet sheet = wb.getSheetAt(i); Iteratorrow_iterator = sheet.rowIterator(); while (row_iterator.hasNext()) { row = (Row) row_iterator.next(); Iterator
cell_iterator = row.cellIterator(); while (cell_iterator.hasNext()) { cell = cell_iterator.next(); int type = cell.getCellType(); if (type == HSSFCell.CELL_TYPE_STRING) { text = cell.getStringCellValue(); flag = searchText(text); if (flag) { break OUTERMOST; } } } } } } catch (IOException e) { } finally { try { if (xls != null) { xls.close(); } } catch (IOException e) {} return flag; } } |
处理 PPT:
/**
 * Searches the slide text and the speaker notes of a PowerPoint file using Apache POI (HSLF).
 *
 * @param file path of the PowerPoint file
 * @return {@code true} if any text run contains a search term; {@code false} if nothing
 *     matched or the file could not be read
 */
boolean searchInPPT(String file) {
    try (InputStream fis = new FileInputStream(new File(file))) {
        SlideShow ss = new SlideShow(new HSLFSlideShow(new POIFSFileSystem(fis)));
        for (Slide slide : ss.getSlides()) {
            if (anyRunMatches(slide.getTextRuns())) {
                return true;
            }
            Notes notes = slide.getNotesSheet();
            if (notes != null && anyRunMatches(notes.getTextRuns())) {
                return true;
            }
        }
    } catch (Exception e) {
        // Best effort: an unreadable file counts as "no match", but is reported.
        System.err.println("Cannot search PowerPoint file: " + file + ": " + e);
    }
    return false;
}

/**
 * Tells whether the text of any of the given runs contains a search term.
 *
 * <p>Only the run text is searched. The numeric run type is no longer prepended to it,
 * because that number is not document content and could produce false matches.
 */
private boolean anyRunMatches(TextRun[] runs) {
    if (runs == null) {
        return false;
    }
    for (TextRun run : runs) {
        String text = run.getText();
        if (text != null && searchText(text)) {
            return true;
        }
    }
    return false;
}
搜索文本文件
文本文件(.txt, .html, .xml 等)不需要第三方包,只需要原始的 NIO.2 功能即可完成:
/**
 * Searches a plain text file line by line, decoding it as UTF-8.
 *
 * @param file the text file
 * @return {@code true} if any line contains a search term; {@code false} if nothing matched
 *     or the file could not be read as UTF-8 text
 */
boolean searchInText(Path file) {
    try (BufferedReader reader =
            Files.newBufferedReader(file, java.nio.charset.StandardCharsets.UTF_8)) {
        String line;
        while ((line = reader.readLine()) != null) {
            if (searchText(line)) {
                return true;
            }
        }
    } catch (IOException e) {
        // Best effort: an unreadable or non-UTF-8 file counts as "no match", but is reported.
        System.err.println("Cannot search text file: " + file + ": " + e);
    }
    // Returning after the try block (not inside a finally) lets errors propagate normally.
    return false;
}
编写一个完整的按内容搜索应用程序
好了,有了以上的基础,我们把所有代码结合起来:
import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.parser.PdfTextExtractor; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; import java.nio.file.FileSystems; import java.nio.file.FileVisitOption; import java.nio.file.FileVisitResult; import java.nio.file.FileVisitor; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.attribute.BasicFileAttributes; import java.util.ArrayList; import java.util.EnumSet; import java.util.Iterator; import java.util.StringTokenizer; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.poi.hslf.HSLFSlideShow; import org.apache.poi.hslf.model.Notes; import org.apache.poi.hslf.model.Slide; import org.apache.poi.hslf.model.TextRun; import org.apache.poi.hslf.record.TextHeaderAtom; import org.apache.poi.hslf.usermodel.SlideShow; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; class Search implements FileVisitor { ArrayListwordsarray = new ArrayList<>(); ArrayList documents = new ArrayList<>(); boolean found = false; public Search(String words) { wordsarray.clear(); documents.clear(); StringTokenizer st = new StringTokenizer(words, ","); while (st.hasMoreTokens()) { wordsarray.add(st.nextToken().trim()); } } void search(Path file) throws IOException { found = false; String name = file.getFileName().toString(); int mid = name.lastIndexOf("."); String ext = name.substring(mid + 1, name.length()); if 
(ext.equalsIgnoreCase("pdf")) { found = searchInPDF_iText(file.toString()); if (!found) { found = searchInPDF_PDFBox(file.toString()); } } if (ext.equalsIgnoreCase("doc") || ext.equalsIgnoreCase("docx")) { found = searchInWord(file.toString()); } if (ext.equalsIgnoreCase("ppt")) { searchInPPT(file.toString()); } if (ext.equalsIgnoreCase("xls")) { searchInExcel(file.toString()); } if ((ext.equalsIgnoreCase("txt")) || (ext.equalsIgnoreCase("xml") || ext.equalsIgnoreCase("html")) || ext.equalsIgnoreCase("htm") || ext.equalsIgnoreCase("xhtml") || ext.equalsIgnoreCase("rtf")) { searchInText(file); } if (found) { documents.add(file.toString()); } } //search in text files boolean searchInText(Path file) { boolean flag = false; Charset charset = Charset.forName("UTF-8"); try (BufferedReader reader = Files.newBufferedReader(file, charset)) { String line = null; OUTERMOST: while ((line = reader.readLine()) != null) { flag = searchText(line); if (flag) { break OUTERMOST; } } } catch (IOException e) { } finally { return flag; } } //search in Excel files boolean searchInExcel(String file) { Row row; Cell cell; String text; boolean flag = false; InputStream xls = null; try { xls = new FileInputStream(file); HSSFWorkbook wb = new HSSFWorkbook(xls); int sheets = wb.getNumberOfSheets(); OUTERMOST: for (int i = 0; i < sheets; i++) { HSSFSheet sheet = wb.getSheetAt(i); Iterator row_iterator = sheet.rowIterator(); while (row_iterator.hasNext()) { row = (Row) row_iterator.next(); Iterator
cell_iterator = row.cellIterator(); while (cell_iterator.hasNext()) { cell = cell_iterator.next(); int type = cell.getCellType(); if (type == HSSFCell.CELL_TYPE_STRING) { text = cell.getStringCellValue(); flag = searchText(text); if (flag) { break OUTERMOST; } } } } } } catch (IOException e) { } finally { try { if (xls != null) { xls.close(); } } catch (IOException e) { } return flag; } } //search in PowerPoint files boolean searchInPPT(String file) { boolean flag = false; InputStream fis = null; String text; try { fis = new FileInputStream(new File(file)); POIFSFileSystem fs = new POIFSFileSystem(fis); HSLFSlideShow show = new HSLFSlideShow(fs); SlideShow ss = new SlideShow(show); Slide[] slides = ss.getSlides(); OUTERMOST: for (int i = 0; i < slides.length; i++) { TextRun[] runs = slides[i].getTextRuns(); for (int j = 0; j < runs.length; j++) { TextRun run = runs[j]; if (run.getRunType() == TextHeaderAtom.TITLE_TYPE) { text = run.getText(); } else { text = run.getRunType() + " " + run.getText(); } flag = searchText(text); if (flag) { break OUTERMOST; } } Notes notes = slides[i].getNotesSheet(); if (notes != null) { runs = notes.getTextRuns(); for (int j = 0; j < runs.length; j++) { text = runs[j].getText(); flag = searchText(text); if (flag) { break OUTERMOST; } } } } } catch (IOException e) { } finally { try { if (fis != null) { fis.close(); } } catch (IOException e) { } return flag; } } //search in Word files boolean searchInWord(String file) { POIFSFileSystem fs = null; boolean flag = false; try { fs = new POIFSFileSystem(new FileInputStream(file)); HWPFDocument doc = new HWPFDocument(fs); WordExtractor we = new WordExtractor(doc); String[] paragraphs = we.getParagraphText(); OUTERMOST: for (int i = 0; i < paragraphs.length; i++) { flag = searchText(paragraphs[i]); if (flag) { break OUTERMOST; } } } catch (Exception e) { } finally { return flag; } } //search in PDF files using PDFBox library boolean searchInPDF_PDFBox(String file) { PDFParser parser = null; 
String parsedText = null; PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; boolean flag = false; int page = 0; File pdf = new File(file); try { parser = new PDFParser(new FileInputStream(pdf)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); OUTERMOST: while (page < pdDoc.getNumberOfPages()) { page++; pdfStripper.setStartPage(page); pdfStripper.setEndPage(page + 1); parsedText = pdfStripper.getText(pdDoc); flag = searchText(parsedText); if (flag) { break OUTERMOST; } } } catch (Exception e) { } finally { try { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } catch (Exception e) { } return flag; } } //search in PDF files using iText library boolean searchInPDF_iText(String file) { PdfReader reader = null; boolean flag = false; try { reader = new PdfReader(file); int n = reader.getNumberOfPages(); OUTERMOST: for (int i = 1; i <= n; i++) { String str = PdfTextExtractor.getTextFromPage(reader, i); flag = searchText(str); if (flag) { break OUTERMOST; } } } catch (Exception e) { } finally { if (reader != null) { reader.close(); } return flag; } } //search text private boolean searchText(String text) { boolean flag = false; for (int j = 0; j < wordsarray.size(); j++) { if ((text.toLowerCase()).contains(wordsarray.get(j).toLowerCase())) { flag = true; break; } } return flag; } @Override public FileVisitResult postVisitDirectory(Object dir, IOException exc) throws IOException { System.out.println("Visited: " + (Path) dir); return FileVisitResult.CONTINUE; } @Override public FileVisitResult preVisitDirectory(Object dir, BasicFileAttributes attrs) throws IOException { return FileVisitResult.CONTINUE; } @Override public FileVisitResult visitFile(Object file, BasicFileAttributes attrs) throws IOException { search((Path) file); return FileVisitResult.CONTINUE; } @Override public FileVisitResult visitFileFailed(Object file, IOException exc) 
throws IOException { //report an error if necessary return FileVisitResult.CONTINUE; } } class Main { public static void main(String[] args) throws IOException { String words = "Rafael Nadal, tennis, winner of Roland Garros, BNP Paribas tournament draws"; Search walk = new Search(words); EnumSet opts = EnumSet.of(FileVisitOption.FOLLOW_LINKS); Iterable | dirs = FileSystems.getDefault().getRootDirectories(); for (Path root : dirs) { Files.walkFileTree(root, opts, Integer.MAX_VALUE, walk); } System.out.println("____________________________________________________________"); for (String path_string : walk.documents) { System.out.println(path_string); } System.out.println("____________________________________________________________"); } }
文章来源: http://www.aptusource.org/2014/04/nio-2-writing-a-rile-search-application/