需求:每天读取一个大小在 1GB 到 3GB 之间的文件(数据量约 400 万条),读取完成后调用谷歌翻译 API 将内容翻译成中文,再保存到数据库。
实现方面采用多线程,使用 RandomAccessFile 读取文件,并按缓冲区分片处理;其中还用到了一些并发工具类,例如 CyclicBarrier、
AtomicInteger 等。由于每天的文件中会有重复数据,读取完成后会先与 Redis 中已有的记录做对比过滤。调用谷歌 API 翻译的部分
采用了分段提交,例如每组 100 条记录调用一次 API;同时,为了解决并发频繁调用 API 被谷歌限流(返回 403 错误)的问题,使用了 Guava 的 RateLimiter 做限流。如果是分布式系统,建议采用基于 Redis 的限流方案;关于并发限流算法,大家可以自行上网查阅。最后是保存入库:目前公司使用的是 JPA(Hibernate),实体类中用了 version 字段做乐观锁,导致并发保存时报乐观锁错误;如果对数据一致性要求不高,可以忽略这个问题,我这里的做法是去掉了 version 字段。下面贴上完整代码,主要是 3 个类。
package com.hilton.hcs.china.ari.file; import org.springframework.stereotype.Service; @Service public interface IHandle { public void handle(String line); }
package com.hilton.hcs.china.ari.file; import java.io.*; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel.MapMode; import java.security.InvalidParameterException; import java.util.HashSet; import java.util.Set; import java.util.concurrent.CyclicBarrier; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.atomic.AtomicLong; public class BigFileReader { private int threadSize; private String charset; private int bufferSize; private IHandle handle; private ExecutorService executorService; private long fileLength; private RandomAccessFile rAccessFile; private SetstartEndPairs; private CyclicBarrier cyclicBarrier; private AtomicLong counter = new AtomicLong(0); private BigFileReader(File file, IHandle handle, String charset, int bufferSize, int threadSize) { this.fileLength = file.length(); this.handle = handle; this.charset = charset; this.bufferSize = bufferSize; this.threadSize = threadSize; try { this.rAccessFile = new RandomAccessFile(file, "r"); } catch (FileNotFoundException e) { e.printStackTrace(); } this.executorService = Executors.newFixedThreadPool(threadSize); startEndPairs = new HashSet (); } public void start() { long everySize = this.fileLength / this.threadSize; try { calculateStartEnd(0, everySize); } catch (IOException e) { e.printStackTrace(); return; } final long startTime = System.currentTimeMillis(); cyclicBarrier = new CyclicBarrier(startEndPairs.size(), new Runnable() { @Override public void run() { System.out.println("use time: " + (System.currentTimeMillis() - startTime)); System.out.println("all line: " + counter.get()); } }); for (StartEndPair pair : startEndPairs) { System.out.println("分配分片:" + pair); this.executorService.execute(new SliceReaderTask(pair)); } } private void calculateStartEnd(long start, long size) throws IOException { if (start > fileLength - 1) { return; } StartEndPair pair = new StartEndPair(); pair.start = start; long endPosition = start + 
size - 1; if (endPosition >= fileLength - 1) { pair.end = fileLength - 1; startEndPairs.add(pair); return; } rAccessFile.seek(endPosition); byte tmp = (byte) rAccessFile.read(); while (tmp != '\n' && tmp != '\r') { endPosition++; if (endPosition >= fileLength - 1) { endPosition = fileLength - 1; break; } rAccessFile.seek(endPosition); tmp = (byte) rAccessFile.read(); } pair.end = endPosition; startEndPairs.add(pair); calculateStartEnd(endPosition + 1, size); } public void shutdown() { try { this.rAccessFile.close(); } catch (IOException e) { e.printStackTrace(); } this.executorService.shutdown(); } private void handle(byte[] bytes) throws UnsupportedEncodingException { String line = null; if (this.charset == null) { line = new String(bytes); } else { line = new String(bytes, charset); } if (line != null && !"".equals(line)) { this.handle.handle(line); counter.incrementAndGet(); } } private static class StartEndPair { public long start; public long end; @Override public String toString() { return "star=" + start + ";end=" + end; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + (int) (end ^ (end >>> 32)); result = prime * result + (int) (start ^ (start >>> 32)); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; StartEndPair other = (StartEndPair) obj; if (end != other.end) return false; if (start != other.start) return false; return true; } } private class SliceReaderTask implements Runnable { private long start; private long sliceSize; private byte[] readBuff; public SliceReaderTask(StartEndPair pair) { this.start = pair.start; this.sliceSize = pair.end - pair.start + 1; this.readBuff = new byte[bufferSize]; } @Override public void run() { try { MappedByteBuffer mapBuffer = rAccessFile.getChannel().map(MapMode.READ_ONLY, start, this.sliceSize); ByteArrayOutputStream bos = new 
ByteArrayOutputStream(); for (int offset = 0; offset < sliceSize; offset += bufferSize) { int readLength; if (offset + bufferSize <= sliceSize) { readLength = bufferSize; } else { readLength = (int) (sliceSize - offset); } mapBuffer.get(readBuff, 0, readLength); for (int i = 0; i < readLength; i++) { byte tmp = readBuff[i]; if (tmp == '\n' || tmp == '\r') { handle(bos.toByteArray()); bos.reset(); } else { bos.write(tmp); } } } if (bos.size() > 0) { handle(bos.toByteArray()); } cyclicBarrier.await();//等待其它线程执行完 } catch (Exception e) { e.printStackTrace(); } } } public static class Builder { private int threadSize = 1; private String charset = null; private int bufferSize = 1024 * 1024; private IHandle handle; private File file; public Builder(String file, IHandle handle) { this.file = new File(file); if (!this.file.exists()) throw new IllegalArgumentException("The path can not be null or empty"); this.handle = handle; } public Builder withTreahdSize(int size) { if (size < 1) { throw new InvalidParameterException("The threadCount can not be less than 1"); } this.threadSize = size; return this; } public Builder withCharset(String charset) { this.charset = charset; return this; } public Builder withBufferSize(int bufferSize) { this.bufferSize = bufferSize; return this; } public BigFileReader build() { return new BigFileReader(this.file, this.handle, this.charset, this.bufferSize, this.threadSize); } } }
package com.hilton.hcs.china.ari.file; import com.google.common.util.concurrent.RateLimiter; import com.google.gson.Gson; import com.hilton.hcs.china.ari.model.TransData; import com.hilton.hcs.china.ari.redis.RedisClientTemplate; import com.hilton.hcs.china.ari.service.AbstractService; import com.hilton.hcs.china.ari.utils.StringUtil; import com.hilton.hcs.data.content.ShopPropertyInfo; import com.hilton.hcs.data.content.ShopPropertyInfoRepository; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.HttpMethod; import org.springframework.http.ResponseEntity; import org.springframework.stereotype.Service; import java.io.File; import java.util.*; import java.util.concurrent.atomic.AtomicInteger; @Service public class HiltonTransFile extends AbstractService { private static volatile int fileType = 0; private static AtomicInteger count = new AtomicInteger(0); private static StringBuilder stringBuilder = null; @Autowired private RedisClientTemplate redisClientTemplate; public void transFile(ShopPropertyInfoRepository shopPropertyInfoRepository) { // FileInputStream input = null; try { // FTPUtil ftpUtil = new FTPUtil(); // String absPath = "/home/ftpuser/hilton/"; // String localPath = "C:\\Users\\DT302\\Desktop\\storeFile"; // FTPFile[] ftpFiles = ftpUtil.getFtpClient().listFiles(absPath); // if (null != ftpFiles && ftpFiles.length > 0) { // for (FTPFile ftpFile : ftpFiles) { // input = new FileInputStream(absPath + ftpFile.getName()); // ftpUtil.getFtpClient().storeFile(ftpFile.getName(), input); // } // // } File filedir = new File("C:\\Users\\DT302\\Desktop\\翻译\\"); if (!filedir.exists()) { throw new Exception("file dir not exists"); } else { for (File file : filedir.listFiles()) { if (file.isDirectory()) { continue; } if (file.getName().contains("srp")) { fileType = 1; //价格 } else { fileType = 0; //房型 } BigFileReader.Builder builder = new BigFileReader.Builder("C:\\Users\\DT302\\Desktop\\翻译\\" + file.getName(), new 
IHandle() { private volatile ListshopPropertyInfos = new ArrayList<>(); private volatile List trans = new ArrayList<>(); private volatile ShopPropertyInfo shopPropertyInfo = null; private volatile List cutList = null; private volatile List cutBeanList = null; RateLimiter limiter = RateLimiter.create(50.0); // 限流 @Override public synchronized void handle(String line) { try { String arr[] = line.split("\\|"); if (fileType == 0 || arr.length < 123) { String propCode = arr[0]; String roomTypeCode = arr[1]; String sourceDesc = arr[14]; String sourceDescRedis = redisClientTemplate.get(propCode + "_" + roomTypeCode); if (StringUtil.isEmpty(sourceDescRedis) || !sourceDesc.equals(sourceDescRedis)) { shopPropertyInfo = new ShopPropertyInfo(); shopPropertyInfo.setPropPode(propCode); shopPropertyInfo.setRoomTypeCode(roomTypeCode); shopPropertyInfo.setSourceDesc(sourceDesc); shopPropertyInfos.add(shopPropertyInfo); trans.add(sourceDesc.toLowerCase()); redisClientTemplate.set(propCode + "_" + roomTypeCode, sourceDesc); } } else { stringBuilder = new StringBuilder(); String propCode = arr[3]; String srpCode = arr[2]; int j = 0; for (int i = 113; i < 123; i++) { if (arr.length > 123 && !StringUtil.isEmpty(arr[i])) { String sourceDesc = arr[i].toLowerCase(); String sourceDescRedis = redisClientTemplate.get(propCode + "_" + srpCode + "_" + j); if (StringUtil.isEmpty(sourceDescRedis) || !sourceDesc.equals(sourceDescRedis)) { shopPropertyInfo = new ShopPropertyInfo(); shopPropertyInfo.setPropPode(propCode); shopPropertyInfo.setRatePlanCode(srpCode); shopPropertyInfo.setSourceDesc(sourceDesc); trans.add(sourceDesc); shopPropertyInfos.add(shopPropertyInfo); redisClientTemplate.set(propCode + "_" + srpCode + "_" + j, sourceDesc); } } j++; } } limiter.acquire(); // 请求RateLimiter, 超过permits会被阻塞 transToDB(); } catch (Exception e) { e.printStackTrace(); } // increat(); System.out.println("=========line is " + count.addAndGet(1)); } private void transToDB() { //分段执行谷歌批量翻译 int flag = 
100;//每次取的数据 int size = trans.size(); int temp = size / flag + 1; boolean special = size % flag == 0; for (int i = 0; i < temp; i++) { if (null == trans || trans.size() > 1) { continue; } if (i == temp - 1) { if (special) { break; } cutList = trans.subList(flag * i, size); cutBeanList = shopPropertyInfos.subList(flag * i, size); } else { cutList = trans.subList(flag * i, flag * (i + 1)); cutBeanList = shopPropertyInfos.subList(flag * i, flag * (i + 1)); } // String result = transByGoole("en", "zh-CN", cutList); // TransData transData = new Gson().fromJson(result, TransData.class); // if (null != transData && null != transData.getData()) { //入库 if (null != cutList && cutList.size() > 0) { for (int j = 0; j < cutList.size(); j++) { ShopPropertyInfo shopPropertyInfo = cutBeanList.get(j); shopPropertyInfo.setDescription("暂无"); if (!StringUtil.isEmpty(shopPropertyInfo.getRoomTypeCode())) { //j随机数,主要用于查询shop,redis获取list,同时解决ratePlanCode覆盖问题 if (!StringUtil.isEmpty(shopPropertyInfo.getDescription())) { redisClientTemplate.set(shopPropertyInfo.getPropPode() + "_" + shopPropertyInfo.getRoomTypeCode() + "_zh" + j, shopPropertyInfo.getDescription()); } } else { if (!StringUtil.isEmpty(shopPropertyInfo.getDescription())) { redisClientTemplate.set(shopPropertyInfo.getPropPode() + "_" + shopPropertyInfo.getRatePlanCode() + "_zh" + j, shopPropertyInfo.getDescription()); } } shopPropertyInfo.setCreatedTime(new Date()); shopPropertyInfoRepository.save(shopPropertyInfo); } //清空(操作的是原list) cutList.clear(); cutBeanList.clear(); } } // } } }); //Runtime.getRuntime().availableProcessors() 获取可用线程数 builder.withTreahdSize(Runtime.getRuntime().availableProcessors()).withCharset("gbk") .withBufferSize(1024 * 1024); //设置读取缓冲区大小 BigFileReader bigFileReader = builder.build(); bigFileReader.start(); } } } catch (Exception e) { e.printStackTrace(); } finally { try { // input.close(); } catch (Exception e) { e.printStackTrace(); } } } public String transByGoole(String source, String target, List 
trans) { //谷歌翻译 Map params = new HashMap<>(); params.put("q", trans); params.put("target", target); params.put("format", "text"); params.put("source", source); params.put("model", "nmt"); ResponseEntity googleResponse = exchange("https://translation.googleapis.com/language/translate/v2?key=yourKey", HttpMethod.POST, params, String.class, null); return googleResponse.getBody() + ""; } public static void main(String args[]) { String body = new HiltonTransFile().transByGoole("", "", null); System.out.println(new Gson().fromJson(body, TransData.class).getData().getTranslations().get(0).getTranslatedText()); } }