# Do not omit print(): inside a loop or function, sprintf() alone shows nothing.
print(sprintf("something : %d", i))
print 按 R 的字面形式显示字符串(带引号,"\t" 等转义字符不会被解释);cat 会解释转义字符并直接输出内容。注意 print 会自动换行,cat 不会(需要时自行在末尾加 "\n")。print 可以打印各种对象(包括 list、data.frame);cat 只能输出原子向量(字符串、数值等),不能直接输出一般的 list。
因此,含转义字符的字符串用 cat;
其他数据类型(如 list)用 print。
> print("1\t2")
[1] "1\t2"
> cat("1\t2")
1 2
tolower(data$gender)
print(class(data))
判断ori_data[,1]中是否存在元素a:
a %in% ori_data[,1]
如果存在返回 TRUE,否则返回 FALSE(a 为向量时,逐元素返回一个逻辑向量)
> frame = data4_trans[1:3,1:3]
> frame
RAB4B C12orf5 RNF44
TCGA.AA.3850.01 "7.246805e+04" "1.580843e+05" "3.868779e+05"
TCGA.AA.3844.01 "6.406208e+04" "2.195557e+05" "5.231949e+05"
TCGA.CK.5913.01 "3.165322e+04" "1.168961e+05" "3.187141e+05"
> i=1
> j=2
> frame[i,j]
[1] "1.580843e+05"
> frame[i,]
RAB4B C12orf5 RNF44
"7.246805e+04" "1.580843e+05" "3.868779e+05"
> frame[,j]
TCGA.AA.3850.01 TCGA.AA.3844.01 TCGA.CK.5913.01
"1.580843e+05" "2.195557e+05" "1.168961e+05"
> frame[1:2,j]
TCGA.AA.3850.01 TCGA.AA.3844.01
"1.580843e+05" "2.195557e+05"
frame["RAB4B","TCGA.AA.3850.01"] # 按行列名读元素
frame["RAB4B",] # 按行名读行
frame[,"TCGA.AA.3850.01"] #按列名读列
frame$TCGA.AA.3850.01 #按列名读列
frame[c("RAB4B","RNF44"),] #按行名读一些行
frame$TCGA.AA.3850.01
> nrow(frame)
[1] 3
> ncol(frame)
[1] 3
> dim(frame)
[1] 3 3
> dim(frame)[1]
[1] 3
> dim(frame)[2]
[1] 3
> names(frame) # 列名 (如果是一维的话就是元素名)
[1] "TCGA.AA.3850.01" "TCGA.AA.3844.01" "TCGA.CK.5913.01"
> colnames(frame)# 列名
[1] "TCGA.AA.3850.01" "TCGA.AA.3844.01" "TCGA.CK.5913.01"
> rownames(frame) # 行名
[1] "RAB4B" "C12orf5" "RNF44"
> row.names(frame) # 行名
[1] "RAB4B" "C12orf5" "RNF44"
colnames(frame) = names
rownames(frame) = names
names(result)<-NULL
names(CMS_result)<-NULL
rownames(dataset) <- NULL
frame[-1,] # 删掉第一行
# Drop columns by name. A logical mask is used instead of -which(...):
# when no name matches, -which(...) is an empty index and would drop
# every column. drop = FALSE keeps the result a data.frame.
data <- data[, !(colnames(data) %in% c("Primary.Tumor.Site", "Stage")), drop = FALSE]
# 1 where the primary tumor site is "Cecum", 0 otherwise.
data$Cecum <- as.numeric(data$Primary.Tumor.Site == "Cecum")
save(data4_trans,file="trans.Rdata") # 保存为二进制文件,读写都快
load("trans.Rdata") # 这样就会有个data4_trans变量
dataset = data4_trans
dataset = data.frame(array(as.numeric(dataset),dim=dim(dataset))) # 转为numeric
# Rebuild as m rows by n columns; the dimensions must be given as c(m, n).
dataset <- data.frame(array(as.numeric(dataset), dim = c(m, n)))
append;
indeces = c()
indeces = c(indeces,i)
长度:
length(vec)
元素名:
names(vec)
直接 as.numeric()
data$gender = (tolower(data$gender) == "male" )
data$gender = as.numeric(data$gender)
index = regexpr("[0-9]+-[\\s]*", str)[1] # 返回第一个参数在第二个参数str中首次出现的index。(后面是[1] ,否则还有其他一些值可以输出)
indeces = grep("_Rep", colnames(data1_no_date_rows))
which的用法,逻辑矩阵做下标
只要是分隔符和换行符分开的,不是csv后缀也行。读出来的格式是data.frame
# 用csv的方式读取原始数据:(默认第一行为列名)
RNAFile = "mRNA_FPKM_UQ.txt"
#result = readLines(RNAFile)
data <- read.csv(RNAFile, encoding="UTF-8",sep="\t")
print(class(data)) # data.frame
# -*-coding:utf-8-*-
# Title : main.R
# Objective : learning R language from nothing
# Created by: 19391
# Created on: 2021/5/10
# Read the raw data as delimited text (the first line is used as the header).
RNAFile <- "mRNA_FPKM_UQ.txt"
# result <- readLines(RNAFile)
data <- read.csv(file = RNAFile, sep = "\t", encoding = "UTF-8")
print(class(data))
# --------------------- Find duplicated rows -------------------------------
# Tabulate the tag column: one row per distinct tag (Var1) together with the
# number of times it occurs (Freq).
n_occur <- data.frame(table(data$Tag)) # n_occur <- data.frame(table(data[, 1]))
# print(n_occur)
n_occur[which(n_occur$Freq > 1), ] # the tags that occur more than once
# which(data$Tag %in% n_occur$Var1[n_occur$Freq > 1]) # row numbers of the duplicates # 10352 10650 15806 15893
# data[data$Tag %in% n_occur$Var1[n_occur$Freq > 1], ] # the records that occur more than once
# Result: two row names occur twice, and some tags are dates, which are
# clearly not gene names; consider deleting every date row.
# -------------------- Remove date rows -----------------------
# Drop the rows whose tag is a date (regular expression: the tag starts with
# digits followed by "-").
# NOTE(review): the duplicate check earlier reads data$Tag while this section
# reads data$Tags -- confirm the real column name.
# regexpr() is vectorised: for every tag it returns the start position of the
# first match (-1 when there is none), so 1 means "matches at the start".
date_match <- regexpr("[0-9]+-[\\s]*", data$Tags)
indeces <- which(date_match == 1)
for (i in indeces) {
  print(sprintf("%d: %d, %s", i, date_match[i], data$Tags[i]))
}
print(indeces)
print(sprintf("There are %d rows wth date tags", length(indeces)))
# Negative indexing with an empty vector selects zero rows, so only subset
# when at least one date row was found.
if (length(indeces) > 0) {
  data1_no_date_rows <- data[-indeces, ]
} else {
  data1_no_date_rows <- data
}
print(dim(data1_no_date_rows))
# Sanity check: deleted rows + remaining rows must equal the original count.
deletedrows <- length(indeces)
leftrows <- dim(data1_no_date_rows)[1]
print(sprintf("deleted rows + left rows = %d + %d = %d", deletedrows,leftrows, deletedrows+leftrows))
print(sprintf("original rows = %d", dim(data)[1]))
# ------------- Use the first column as row names so that only data columns remain ------------------
row.names(data1_no_date_rows) <- data1_no_date_rows[[1]] # first column becomes the row names
data1_no_date_rows <- data1_no_date_rows[, -1] # drop that column; the rest is the data
# head(data1_no_date_rows) # probably not needed
print(data1_no_date_rows[1:5, 1:5])
# ------------------------- Remove replicate columns ----------------
# grep() returns the positions of the column names that contain "_Rep".
indeces <- grep("_Rep", colnames(data1_no_date_rows))
print(indeces)
# Print only the headers (zero rows) to confirm that they all contain "_Rep".
print(data1_no_date_rows[0,indeces])
## Select those columns:
# rep_cols <- data[c(grep("_Rep", colnames(data)))]
# print(rep_cols[1:5, ])
# Drop the replicate columns. Negative indexing with an empty vector would
# drop every column, so keep the frame unchanged when nothing matched.
if (length(indeces) > 0) {
  data2_no_rep <- data1_no_date_rows[-indeces]
} else {
  data2_no_rep <- data1_no_date_rows
}
print(dim(data2_no_rep))
print(dim(data1_no_date_rows)) # 19726 RNA names, 459 people
print(length(indeces))
# -------------- Handle NA and non-numeric values ----------------
# NA handling
anyNA(data2_no_rep) # is there any NA? (there was)
temp <- is.na(data2_no_rep) # TRUE/FALSE matrix # very fast
print(length(which(temp))) # number of NA cells # very fast
print(data2_no_rep[temp]) # super slow
# which(arr.ind = TRUE) returns the (row, col) position of every TRUE cell.
# It replaces which2D(), which is only defined further down in this file.
print(which(temp, arr.ind = TRUE)) # 548 458
print(data2_no_rep[547:550,457:458]) # printing the cells shows the NA
data2_no_rep[temp] <- 0 # replace NA with 0
# Check the same cells again
print(data2_no_rep[547:550,457:458])
anyNA(data2_no_rep) # confirm: there's no NA now
# 处理非数值的值
# Locate the cells of `frame` that cannot be converted to numeric.
#
# frame: a data.frame or matrix.
# Returns an integer matrix with columns `i` (row) and `j` (column), one row
# per cell for which as.numeric() yields NA. Cells that are already NA are
# reported as well. as.numeric() emits a coercion warning when it meets a
# non-numeric string.
nonNumericInFrame <- function(frame) {
  # as.matrix() + as.numeric() flattens the cells column by column.
  test <- as.numeric(as.matrix(frame))
  logiVec <- which(is.na(test))
  n_rows <- nrow(frame)
  # Convert 1-based column-major positions to (row, col). Shifting to 0-based
  # first keeps a cell in the last row from becoming row 0 of the next column.
  i <- as.integer((logiVec - 1L) %% n_rows + 1L)
  j <- as.integer((logiVec - 1L) %/% n_rows + 1L)
  cbind(i, j)
}
(result <- nonNumericInFrame(data2_no_rep))
print(result) # fast
data2_no_rep[result] # slow; typing the indices by hand is quicker (no obvious reason why this is slow)
data2_no_rep[548, 53] # fast "91.8737221\001\b\020\001 \b\020\003\b\020399041"
data2_no_rep[548, 53] <- 91.8737221 # judged by eye to be the intended value; check again below
result <- nonNumericInFrame(data2_no_rep)
print(result)
# test <- t(data2_no_rep)
# anyNA(test)
# ----------------- Remove genes whose expression is 0 in more than 50% of the samples ------------------
# Count the zeros of every row in one vectorised pass; comparing the frame
# row by row in a loop and growing the result vector was very slow.
zero_counts <- rowSums(data2_no_rep == 0, na.rm = TRUE)
delIndecies <- unname(which(zero_counts > ncol(data2_no_rep) / 2))
del_count <- length(delIndecies)
print(del_count)
# print(delIndecies)
print(length(delIndecies))
# Negative indexing with an empty vector selects zero rows, so only subset
# when there is something to delete.
if (length(delIndecies) > 0) {
  data3_remove0 <- data2_no_rep[-delIndecies, ]
} else {
  data3_remove0 <- data2_no_rep
}
print(dim(data3_remove0)[1]) #16970 459
print(dim(data2_no_rep)[1])
print(dim(data3_remove0)[1] + del_count)
save(data3_remove0,file = "beforeSd.Rdata") # remember to have this "file="
# ----------------- Standard deviation of every gene's expression values.
# The stated goal is to filter out low-variance genes (sd below 0.3).
# NOTE(review): the filtering step itself is not part of this section.
sdResult <- apply(data3_remove0, MARGIN = 1, FUN = sd) # 20s
if (anyNA(sdResult)) {
  print(which(is.na(sdResult))) # fast
  sdResult[is.na(sdResult)] <- 0
}
anyNA(sdResult)
save(sdResult, file = "sd.Rdata")
# --------- load existing data here -----------------------------------
#load("beforeSd.Rdata") # 正常运行时间:小于1s
#load("sd.Rdata")
#sdResult <- eval(parse(text = sdResult))
#data3_remove0 <- eval(parse(text = test)) # make the loaded data "data.frame" again
#print(class(data3_remove0))
#print(data3_remove0[1:5,1:5])
# --------------- Plot the standard deviations for a rough look ------------
print(length(sdResult))
print(length(sort(sdResult)))
# The x/y expressions are kept as written: plot() uses them as axis labels.
plot(x = c(1:length(sdResult)), y = log(sort(sdResult)), type = "l")
data4_trans <- t(data3_remove0)
anyNA(data4_trans)
save(data4_trans, file = "trans.Rdata")
# ---------------- k-means clustering ---------------
load("trans.Rdata")
km <- kmeans(data4_trans, 3)
# don't print(km)
str(km)
# Rough visualisation: scatter the first two columns of the clustered matrix,
# one point per row, coloured by the cluster assigned to that row.
plot(data4_trans[, c(1, 2)], col = km$cluster)
# Mark the cluster centres in the same two dimensions. The centres have the
# same columns as data4_trans, so they are selected by position.
points(km$centers[, c(1, 2)], col = 1:3, pch = 8, cex = 2)
# NOTE(review): data4 is not created in this section -- confirm where it
# comes from.
View(sapply(data4, class))
## ------------------ Brand New Code Now!!! ----------------
# 定义一个函数,用于给出which
# Return the (row, column) position of every TRUE cell of a logical matrix.
#
# logitMat: a logical matrix.
# Returns an integer matrix with columns `i` (row) and `j` (column), one row
# per TRUE cell, in column-major order.
which2D <- function(logitMat) {
  hits <- which(logitMat)
  n_rows <- nrow(logitMat)
  # Convert 1-based column-major positions to (row, col). Shifting to 0-based
  # first keeps a cell in the last row from becoming row 0 of the next column.
  i <- as.integer((hits - 1L) %% n_rows + 1L)
  j <- as.integer((hits - 1L) %/% n_rows + 1L)
  cbind(i, j)
}
load("trans.Rdata") # 458*16970
load("beforeSd.Rdata")
# NOTE(review): data4 is not created in this section; presumably it comes
# from one of the loaded files -- confirm.
# data4 <- as.numeric(data4) # 214855
anyNA((data4))
# Convert the linear (column-major) position 214855 into integer row and
# column indices.
i <- (214855 - 1) %% nrow(data4) + 1 # 53
j <- (214855 - 1) %/% nrow(data4) + 1 # 470
print(data4[i,j]) # "91.8737221\001\b\020\001 \b\020\003\b\020399041"
print(data3_remove0[j,i]) # 470,53
print(is.numeric(data3_remove0[j,i]))
print(colnames(data3_remove0))
print(colnames(data3_remove0)[i])
print(rownames(data3_remove0)[j])
print(nrow(data4))
print(which(is.na(data4)))
naid <- which2D(is.na(data4))
print(naid)
# R is case sensitive: the index matrix is `naid`.
print(data4[naid])
# Small experiments with tapply() and by() on vectors, data frames and
# matrices.
v <- 1:5
ind <- 1:5
tapply(v, ind, is.numeric)
# The c(...) calls are kept as written: data.frame() derives the column names
# from these expressions.
df <- data.frame(c(1:5), c(6:10))
print(df)
ind <- data.frame(c(1:5), c(6:10))
# ind <- c(1, 1, 1, 2, 2)
print(ind)
res <- by(df, ind, is.numeric)
res
print(res)
m <- matrix(1:10, nrow = 2)
m
ind <- matrix(1:10, nrow = 2)
ind
# (x <- array(tapply(m, ind, is.numeric), dim = dim(m)))
返回类型是frame
data <- read.csv(RNAFile, encoding="UTF-8",sep="\t")
当第一个参数是要找的子串,第二个参数是一个list的时候,返回值是list中含此子串的元素的index。
indeces <- grep("_Rep", colnames(data))
print(indeces)
# Keep only the matching columns.
data[c(indeces)]
# Drop the matching columns. If indeces is empty this drops every column,
# so check length(indeces) > 0 first.
data[-c(indeces)]
dim(my_frame)
print(nrow(data_no_rep))
print(ncol(data_no_rep))
print(dim(data_no_rep)[1])
print(dim(data_no_rep)[2])
#除去在50%以上样本表达值都为0 的基因,
for(i in c(1:nrow(data_no_rep))){
temp <- data_no_rep[i,]==0
print(sprintf('%dth col: %d', i,length(which(temp==TRUE))))
}
# Always wrap sprintf() in print() (R has no printf()); otherwise nothing is
# printed inside a for loop.
print(sprintf("%d", i))