这里存在一个很大的性能问题:如果两边文件的 size 相同,是否还要再读取两边的流来比较 MD5?这样做非常浪费。最后的做法是改用 modify time 作为比较依据,只有在修改时间不一致时才去比较 MD5:
import java.io._
import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.slf4j.LoggerFactory
/**
 * Created by todd.chen on 16/3/22.
 * email : todd.chen@ximalaya.com
 *
 * One-way synchronisation of a local file or directory onto a Hadoop
 * `FileSystem` path.
 *
 * The modification time is used as the cheap equality check: after every copy
 * the remote modification time is set to the local one, so an unchanged file is
 * recognised without reading its content. Only when the timestamps differ are
 * the MD5 digests of both sides compared.
 *
 * An optional `FileFilter` restricts which direct children of the root
 * directory take part; sub-directories are synchronised without the filter.
 */
object PathSyncer {

  lazy val logger: org.slf4j.Logger = LoggerFactory.getLogger(this.getClass)

  /**
   * Synchronises `file` onto `path`.
   *
   * If `path` does not exist yet the local content is copied as a whole,
   * otherwise the two trees are compared and only differences are transferred.
   *
   * @param file          local file or directory (the source of truth)
   * @param path          target path on `fileSystem`
   * @param fileFilterOps optional filter applied to the direct children of `file`
   * @param fileSystem    target file system
   * @throws IOException              if a file system operation fails or a local directory cannot be listed
   * @throws IllegalArgumentException if one side is a regular file and the other is not
   */
  @throws(classOf[IOException])
  def sync(file: File, path: Path, fileFilterOps: Option[FileFilter] = None)
          (implicit fileSystem: FileSystem): Unit =
    if (!initPath(file, path, fileFilterOps)) {
      syncChildren(file, path, fileFilterOps)
    }

  /**
   * Dispatches on whether the local and the remote side are regular files.
   * Mixed combinations (file on one side only) are rejected.
   */
  private def syncChildren(file: File, path: Path, fileFilterOps: Option[FileFilter] = None)
                          (implicit fileSystem: FileSystem): Unit =
    (file.isFile, fileSystem.isFile(path)) match {
      case (true, true) =>
        // Seed the timestamp map with the remote file's own modification time so
        // that an unchanged file is recognised instead of being copied every run.
        val remoteTimes = Map(path.getName -> fileSystem.getFileStatus(path).getModificationTime)
        syncFile(file, path, remoteTimes)
      case (true, false) =>
        throw new IllegalArgumentException(
          s"local path $file is a regular file but remote path $path is not")
      case (false, true) =>
        throw new IllegalArgumentException(
          s"remote path $path is a regular file but local path $file is not")
      case (false, false) =>
        syncDir(file, path, fileFilterOps)
    }

  /**
   * Performs the initial copy when `path` does not exist yet.
   *
   * @return `true` if the copy was performed (nothing is left to synchronise),
   *         `false` if `path` already exists
   */
  private def initPath(file: File, path: Path, fileFilterOps: Option[FileFilter])
                      (implicit fileSystem: FileSystem): Boolean =
    if (fileSystem.exists(path)) {
      false
    } else {
      logger.debug("{} not exists ,copy from local", path.toString)
      fileFilterOps match {
        case Some(_) if file.isDirectory =>
          // Create the target first: if the filter accepts nothing, no copy would
          // create it and the directory check in syncDirTime would fail.
          fileSystem.mkdirs(path)
          listLocal(file, fileFilterOps).foreach { local =>
            fileSystem.copyFromLocalFile(false, true, new Path(local.toURI), new Path(path, local.getName))
          }
        case _ =>
          // No filter, or the root is not a directory (a filter cannot list it).
          fileSystem.copyFromLocalFile(false, true, new Path(file.toURI), path)
      }
      initTime(file, path, fileFilterOps)
      true
    }

  /**
   * Lists the children of a local directory, optionally through a filter.
   *
   * `File.listFiles` returns `null` when `dir` is not a directory or an I/O
   * error occurs. That case is reported as an `IOException` rather than being
   * treated as "empty", because an empty listing would make the caller delete
   * every remote child.
   */
  @throws(classOf[IOException])
  private def listLocal(dir: File, fileFilterOps: Option[FileFilter]): Array[File] = {
    val children = fileFilterOps match {
      case Some(filter) => dir.listFiles(filter)
      case None => dir.listFiles()
    }
    Option(children).getOrElse(throw new IOException(s"cannot list local directory $dir"))
  }

  /**
   * Synchronises one directory level and recurses into sub-directories.
   *
   * Step 1 deletes remote children that have no local counterpart, step 2
   * copies new or changed local files, step 3 descends into local directories
   * (without the filter).
   */
  @throws(classOf[IOException])
  private def syncDir(file: File, path: Path, fileFilterOps: Option[FileFilter])
                     (implicit fileSystem: FileSystem): Unit = {
    // List the local side before touching the remote side so that an unreadable
    // local directory fails the sync before anything is created or deleted.
    val (localDirs, localFiles) = listLocal(file, fileFilterOps).partition(_.isDirectory)

    needDelete(file, path, fileFilterOps).foreach(child => fileSystem.delete(new Path(path, child), true))

    // Remote file name -> modification time, for the regular files of this level.
    val remoteTimes: Map[String, Long] = fileSystem.listStatus(path)
      .filter(_.isFile)
      .map(status => status.getPath.getName -> status.getModificationTime)
      .toMap

    localFiles.foreach(child => syncFile(child, new Path(path, child.getName), remoteTimes))
    localDirs.foreach(child => syncChildren(child, new Path(path, child.getName)))
  }

  /**
   * Names of the direct children of `hdfsPath` that do not exist (or are not
   * accepted by the filter) under `localFile`. Creates `hdfsPath` if missing.
   */
  private def needDelete(localFile: File, hdfsPath: Path, fileFilterOps: Option[FileFilter])
                        (implicit fileSystem: FileSystem): List[String] = {
    val localNames = listLocal(localFile, fileFilterOps).map(_.getName).toSet
    if (!fileSystem.exists(hdfsPath)) {
      fileSystem.mkdirs(hdfsPath)
    }
    fileSystem.listStatus(hdfsPath)
      .map(_.getPath.getName)
      .filterNot(localNames)
      .toList
  }

  /**
   * Copies `file` to `path` unless both sides are considered identical.
   *
   * @param map remote file name -> remote modification time for the directory containing `path`
   */
  private def syncFile(file: File, path: Path, map: Map[String, Long])
                      (implicit fileSystem: FileSystem): Unit =
    if (!sameFile(file, path, map)) {
      fileSystem.copyFromLocalFile(false, true, new Path(file.toURI), path)
      syncFileTime(file, path)
    } else if (!map.get(path.getName).exists(_ == file.lastModified())) {
      // Same content but different timestamps: align the remote timestamp so the
      // next run takes the cheap path instead of hashing both files again.
      syncFileTime(file, path)
    }

  /**
   * `true` when the remote file is known, exists, and either carries the same
   * modification time as the local file or has the same MD5 digest.
   */
  private def sameFile(file: File, path: Path, map: Map[String, Long])
                      (implicit fileSystem: FileSystem): Boolean =
    map.get(path.getName) match {
      case Some(remoteTime) if file.isFile && fileSystem.exists(path) =>
        file.lastModified() == remoteTime || getHdfsFileMd5(path) == getLocalFileMd5(file)
      case _ =>
        false
    }

  /** Stamps the freshly copied `path` (file or directory tree) with the local modification times. */
  private def initTime(file: File, path: Path, fileFilterOps: Option[FileFilter])
                      (implicit fileSystem: FileSystem): Unit =
    if (fileSystem.isFile(path)) {
      syncFileTime(file, path)
    } else {
      syncDirTime(file, path, fileFilterOps)
    }

  /** Sets the remote modification time to the local one; -1 leaves the access time unchanged. */
  private def syncFileTime(file: File, path: Path)(implicit fileSystem: FileSystem): Unit =
    fileSystem.setTimes(path, file.lastModified(), -1)

  /**
   * Recursively stamps every file below `path` with the modification time of
   * its local counterpart. The filter applies to the first level only.
   */
  private def syncDirTime(file: File, path: Path, fileFilterOps: Option[FileFilter] = None)
                         (implicit fileSystem: FileSystem): Unit = {
    require(file.isDirectory && fileSystem.isDirectory(path))
    val (childFiles, childDirs) = listLocal(file, fileFilterOps).partition(_.isFile)
    childFiles.foreach(child => syncFileTime(child, new Path(path, child.getName)))
    childDirs.foreach(child => syncDirTime(child, new Path(path, child.getName)))
  }

  /** MD5 hex digest of the content of a file on `dfs`. */
  @throws(classOf[IOException])
  private[sync] def getHdfsFileMd5(path: Path)(implicit dfs: FileSystem): String =
    md5Of(dfs.open(path))

  /** MD5 hex digest of the content of a local file. */
  @throws(classOf[IOException])
  private[sync] def getLocalFileMd5(file: File): String =
    md5Of(new FileInputStream(file))

  /** Digests the stream and always closes it afterwards. */
  private def md5Of(in: InputStream): String =
    try {
      DigestUtils.md5Hex(new BufferedInputStream(in))
    } finally {
      in.close()
    }
}
mygithub