package org.jimmy.studyproject.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;

import com.google.common.base.CharMatcher;
import com.google.common.collect.Lists;

/**
 * Helpers for reading Word documents ({@code .doc} / {@code .docx}) from {@link #DIR_PATH},
 * splitting their text into single characters and extracting embedded pictures.
 */
public class Utils {

    /** Opening part of the character class accepted by the filter: ASCII letters, digits, U+4E00..U+9FA5. */
    private static final String UNICODE_REG_PREFIX = "[a-zA-Z0-9\\u4e00-\\u9fa5";

    /** Field-code text left in a paragraph by an embedded equation once whitespace has been removed. */
    private static final String EQUATION_FIELD_CODE = "EMBEDEquation.3";

    /** Directory scanned for source documents. */
    public static final String DIR_PATH = "D:/Resume/TopicSolutions/SourceFile/";

    /** Directory that extracted pictures are written to. */
    public static final String PICTURE_DIR_PATH = "D:/Resume/TopicSolutions/Picture/";

    /**
     * Source text of the filter regex. Holds only the opening part of the character class until
     * {@link #main(String[])} replaces it with the complete expression.
     */
    public static String unicodeRegStr = UNICODE_REG_PREFIX;

    /** Comma-separated regex escapes of the additional characters accepted by the filter. */
    public static final String PUNCTUATION = "\\u3001,\\u03c1,\\u3002,\\uff08,\\u03c8,\\uff09,\\u300a,\\u300b,\\uff0b,\\uff0c,\\uff0d,\\uff0e,\\u33d1,\\u2014,\\u00d7,\\u2019,\\uff1a,\\u005b,\\uff1c,\\u005d,\\uff1d,\\u221e,\\uff1e,\\uff1f,\\u0060,\\u2264,\\u0028,\\u0029,\\u002b,\\u222b,\\u002d,\\u002e,\\u002f,\\u00b1,\\u03b8,\\u007b,\\u043b,\\u007c,\\u003c,\\u003d,\\u007d,\\u003e";

    /**
     * Compiled filter used by {@link #readWordFile()}. May be assigned by callers; when left
     * {@code null} it is compiled from the default expression on first use.
     */
    public static Pattern unicodeReg = null;

    /**
     * Builds the filter pattern, prints its source, then prints every character of the source
     * documents (unfiltered) to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            // Assign rather than append so that repeated invocations do not corrupt the pattern.
            unicodeRegStr = buildUnicodeRegStr();
            System.out.println(unicodeRegStr);
            unicodeReg = Pattern.compile(unicodeRegStr);

            // Swap in readWordFile() here to print only the characters accepted by the filter.
            List<String> contextList = readSourceWordFile();
            writeWordFile(contextList);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints every element of the list to standard output, without separators.
     *
     * @param contextList the strings to print; {@code null} prints nothing
     */
    public static void writeWordFile(List<String> contextList) {
        if (contextList != null) {
            contextList.forEach(System.out::print);
        }
    }

    /**
     * Walks every character of every file in {@link #DIR_PATH} and writes each picture found to
     * {@link #PICTURE_DIR_PATH}.
     *
     * <p>Every directory entry is opened as a binary {@code .doc} document, whatever its extension.
     *
     * @return always the empty string; no text is collected yet
     * @throws Exception if a file cannot be read or parsed, or a picture cannot be written
     */
    public static String getWordAndStyle() throws Exception {
        String text = "";
        File[] fileArr = new File(DIR_PATH).listFiles();
        if (fileArr == null) {
            // Not a directory, or the listing failed.
            return text;
        }
        for (File file : fileArr) {
            try (FileInputStream in = new FileInputStream(file.getAbsolutePath());
                    HWPFDocument doc = new HWPFDocument(in)) {
                int length = doc.characterLength();
                PicturesTable picturesTable = doc.getPicturesTable();
                for (int i = 0; i < length; i++) {
                    // A one-character range gives access to the run that styles that character.
                    Range range = new Range(i, i + 1, doc);
                    CharacterRun cr = range.getCharacterRun(0);
                    if (picturesTable.hasPicture(cr)) {
                        readPicture(picturesTable, cr);
                    }
                }
            }
        }
        return text;
    }

    /**
     * Extracts the picture attached to a character run and writes it to {@link #PICTURE_DIR_PATH}
     * under the file name suggested by the picture itself.
     *
     * @param pTable pictures table of the document that owns {@code cr}
     * @param cr     character run that carries a picture
     * @return path of the written picture file
     * @throws Exception if the picture cannot be written
     */
    public static String readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
        Picture pic = pTable.extractPicture(cr, false);
        String pictureFilePath = PICTURE_DIR_PATH + pic.suggestFullFileName();
        try (OutputStream out = new FileOutputStream(new File(pictureFilePath))) {
            pic.writeImageContent(out);
        }
        return pictureFilePath;
    }

    /**
     * Detail: reads the source documents and drops garbled characters, keeping only those that
     * match {@link #unicodeReg}.
     * Author: Lapis Lazuli (Dawn)
     * Date: 2020-04-22 17:25:17
     *
     * @return the accepted characters, one per element, in document order
     * @throws Exception if a document cannot be read or parsed
     */
    public static List<String> readWordFile() throws Exception {
        return readCharacters(filterPattern());
    }

    /**
     * Detail: reads the source documents and keeps every character, garbled ones included.
     * Author: Lapis Lazuli (Dawn)
     * Date: 2020-04-22 17:24:51
     *
     * @return all non-whitespace characters, one per element, in document order
     * @throws Exception if a document cannot be read or parsed
     */
    public static List<String> readSourceWordFile() throws Exception {
        return readCharacters(null);
    }

    /** Assembles the complete filter expression from the prefix and {@link #PUNCTUATION}. */
    private static String buildUnicodeRegStr() {
        StringBuilder regex = new StringBuilder(UNICODE_REG_PREFIX);
        for (String punctuation : PUNCTUATION.split(",")) {
            regex.append(punctuation);
        }
        return regex.append(']').toString();
    }

    /** Returns {@link #unicodeReg}, compiling the default expression first if it was never set. */
    private static Pattern filterPattern() {
        Pattern pattern = unicodeReg;
        if (pattern == null) {
            pattern = Pattern.compile(buildUnicodeRegStr());
            unicodeReg = pattern;
        }
        return pattern;
    }

    /** Splits the cleaned paragraph text into characters, keeping those matching {@code keep} (all when null). */
    private static List<String> readCharacters(Pattern keep) throws IOException {
        List<String> characters = new ArrayList<>();
        for (String paragraph : readParagraphs()) {
            // Literal replace: the '.' in the field code must not act as a regex wildcard.
            String cleaned = CharMatcher.whitespace().removeFrom(paragraph).replace(EQUATION_FIELD_CODE, "");
            // Step by code point so supplementary characters are not cut into lone surrogates.
            for (int i = 0; i < cleaned.length();) {
                int width = Character.charCount(cleaned.codePointAt(i));
                String currentWord = cleaned.substring(i, i + width);
                if (keep == null || keep.matcher(currentWord).matches()) {
                    characters.add(currentWord);
                }
                i += width;
            }
        }
        return characters;
    }

    /** Collects the raw paragraph text of every .doc and .docx file directly inside {@link #DIR_PATH}. */
    private static List<String> readParagraphs() throws IOException {
        List<String> paragraphs = Lists.newArrayList();
        File[] fileArr = new File(DIR_PATH).listFiles();
        if (fileArr == null) {
            // Not a directory, or the listing failed.
            return paragraphs;
        }
        for (File file : fileArr) {
            String path = file.getAbsolutePath();
            if (path.endsWith(".doc")) {
                paragraphs.addAll(readDocParagraphs(file));
            } else if (path.endsWith(".docx")) {
                paragraphs.addAll(readDocxParagraphs(file));
            }
        }
        return paragraphs;
    }

    /** Reads the paragraph text of one binary .doc file, releasing all resources even on failure. */
    private static List<String> readDocParagraphs(File file) throws IOException {
        try (InputStream stream = new FileInputStream(file);
                HWPFDocument document = new HWPFDocument(stream);
                WordExtractor extractor = new WordExtractor(document)) {
            return Arrays.asList(extractor.getParagraphText());
        }
    }

    /** Reads the paragraph text of one .docx file, releasing all resources even on failure. */
    private static List<String> readDocxParagraphs(File file) throws IOException {
        try (InputStream stream = new FileInputStream(file);
                XWPFDocument document = new XWPFDocument(stream)) {
            List<String> texts = new ArrayList<>();
            for (XWPFParagraph paragraph : document.getParagraphs()) {
                texts.add(paragraph.getParagraphText());
            }
            return texts;
        }
    }
}