LZ编码的R语言实现
LZ_code <- function(seq){
seq <- strsplit(seq, split="")[[1]]
#储存对字符串分段
split_part <- c(NA)
code <- ""
pre <- FALSE
#记录分段前缀的索引
pre_parts <- c(NA)
pre_part <- 1
#对于有任意字符的字符串,用species储存单字符的种类
species <- c()
for(i in 1:length(seq)){
char <- seq[i]
if(!(char %in% species)){
species <- c(species, char)
}
code <- paste(code, char)
#查找已储存的段是否与新的段相同
for(j in split_part[-1]){
#若有相同的段,则加入下一个字符
#并且不能是最后一个字符
if(code == j & i != length(seq)){
pre <- TRUE
pre_part <- which(split_part == j)
break
}
#否则储存为新段
pre <- FALSE
}
if(!pre){
split_part <- c(split_part, code)
#储存该段前缀的索引
pre_parts <- c(pre_parts, pre_part)
code <- ""
pre_part <- 1
}
}
#给段号编码
part_code <- c()
#首先得到段号码长
part_length <- ceiling(log2(length(split_part)))
for(i in 1:length(split_part)){
part_code <- c(part_code, substring(paste(rev(as.integer(intToBits(i-1))),
collapse=""),
(33-part_length), 32))
}
#给基础字符编码
cele_code_length <- ceiling(log2(length(species)))
cele_code <- c()
for(i in 1:length(species)){
cele_code <- c(cele_code, substr(paste(rev(as.integer(intToBits(i-1))),
collapse=""),
(33-cele_code_length), 32))
}
#储存编码
string <- ""
#开始给各段编码
for(i in 2:length(split_part)){
str <- split_part[i]
for(j in 1:length(species)){
if(substr(str, nchar(str), nchar(str))==species[j]){
string <- paste(string, part_code[pre_parts[i]], cele_code[j], sep="")
break
}
}
}
#输出编码后的序列
return(string)
}
seq <- "ADDCFDFDCFH"
LZ_code(seq)