使用sql读取一个文件
read.csv.sql(file=路径或者url,
sql = "select * from file", header = TRUE, sep = ",",
row.names, eol换行符, skip, filter, nrows, field.types,
colClasses, dbname = tempfile(), drv = "SQLite", ...)
read.csv2.sql(file, sql = "select * from file", header = TRUE, sep = ";",
row.names, eol, skip, filter, nrows, field.types,
colClasses, dbname = tempfile(), drv = "SQLite", ...)
数据框的sql语句
sqldf(x, stringsAsFactors = FALSE,
row.names = FALSE, envir = parent.frame(),
method = getOption("sqldf.method"), file.format = list(), dbname, drv = getOption("sqldf.driver"), user, password = "", host = "localhost", port, dll = getOption("sqldf.dll"), connection = getOption("sqldf.connection"), verbose = isTRUE(getOption("sqldf.verbose"))) Arguments
> df
x1 x2 x3 x4 x5
1 99 94 93 100 100
2 99 88 96 99 97
3 100 98 81 96 100
4 93 88 88 99 96
5 100 91 72 96 78
6 90 78 82 75 97
7 75 73 88 97 89
8 93 84 83 68 88
9 87 73 60 76 84
10 95 82 90 62 39
11 76 72 43 67 78
12 85 75 50 34 37
1、先来个最简单的,列出所有人的成绩:
> sqldf("select * from df")
Loading required package: tcltk
政治 语文 外语 数学 物理
1 99 94 93 100 100
2 99 88 96 99 97
3 100 98 81 96 100
4 93 88 88 99 96
5 100 91 72 96 78
6 90 78 82 75 97
7 75 73 88 97 89
8 93 84 83 68 88
9 87 73 60 76 84
10 95 82 90 62 39
11 76 72 43 67 78
12 85 75 50 34 37
2、带个过滤条件,x1学科成绩大于90
> sqldf("select * from df where x1 > 90")
x1 x2 x3 x4 x5
1 99 94 93 100 100
2 99 88 96 99 97
3 100 98 81 96 100
4 93 88 88 99 96
5 100 91 72 96 78
6 93 84 83 68 88
7 95 82 90 62 39
3、看看最常用的sql统计函数吧。。。
# 各科最好成绩
> sqldf("select max(x1), max(x2), max(x3), max(x4) from df")
max(x1) max(x2) max(x3) max(x4)
1 100 98 96 100
# 各科平均成绩,注意x5的平均成绩列用了别名
> sqldf("select avg(x1), avg(x2),avg(x3), avg(x4), round(avg(x5),2) avg_x5 from df")
avg(x1) avg(x2) avg(x3) avg(x4) avg_x5
1 91 83 77.16667 80.75 81.92
4、做个sql嵌套查询
# 查询x1学科成绩最好的人的x1到x5学科成绩
> sqldf("select * from df where x1 = (select max(x1) from df)")
x1 x2 x3 x4 x5
1 100 98 81 96 100
2 100 91 72 96 78
5、分类汇总
> df$class = c(rep("c1",6), rep("c2",6))
> df
# 计算各个班级x1,x2学科的最好成绩
> sqldf("select class, max(x1), max(x2) from df group by class")
class max(x1) max(x2)
1 c1 100 98
2 c2 95 84
6、很给力,还可以做表连接查询
> cl = data.frame(class=c("c1","c2"), cname=c("班级一","班级二"))
> cl
class cname
1 c1 班级一
2 c2 班级二
> sqldf("select cname, max(x1), max(x2) from df inner join cl on df.class=cl.class group by cname")
cname max(x1) max(x2)
1 班级二 95 84
2 班级一 100 98
>