获取目标:1. 电影名称(中文);2. 导演以及主演;3. 年代;4. 国别;5. 评分;6. 评价人数;7. 是否可播放;8. 电影类型
readLines版本,造福厂财人
运行结果
源代码
# Scrape the Douban Movie Top 250 list (10 pages x 25 films) using base R only
# and write one CSV row per film to `out_file`.
#
# Columns written: Chinese title, director/cast line, year, country, rating,
# number of ratings, playable flag, genre.
#
# NOTE(review): the search patterns in the previous version had lost their HTML
# tags (every pattern was ""), one line had unbalanced brackets, and `director`
# was never assigned. The patterns below are reconstructed from the surviving
# fragments ("average\">", "人", "\\[", the 18/13 character offsets that match
# the 13-character "&nbsp;/&nbsp;" separator). Confirm them against the live
# page markup before relying on the output.
url0 <- "https://movie.douban.com/top250?start="
out_file <- "D:TOP250.csv"

# Return the first capture group of `pattern` for every element of `x`
# (NA_character_ where the pattern does not match).
capture_first <- function(pattern, x) {
  hits <- regmatches(x, regexec(pattern, x, perl = TRUE))
  vapply(
    hits,
    function(m) if (length(m) >= 2L) m[2L] else NA_character_,
    character(1)
  )
}

# Return the k-th element of every vector in `parts` (NA when it is missing).
nth_part <- function(parts, k) {
  vapply(
    parts,
    function(p) if (length(p) >= k) p[k] else NA_character_,
    character(1)
  )
}

# The header row is written exactly once, with the first page that parses.
header_written <- FALSE

for (i in seq(0, 225, 25)) {
  url <- paste0(url0, as.character(i), "&filter=")
  web <- readLines(url, encoding = "UTF-8", warn = FALSE)

  # Work per film instead of per line offset: every field is extracted from
  # the same chunk, so all columns are guaranteed to have equal length.
  # presumably each film is wrapped in <div class="item"> -- TODO confirm
  page <- paste(web, collapse = "\n")
  items <- strsplit(page, "<div class=\"item\">", fixed = TRUE)[[1]][-1]
  if (length(items) == 0L) {
    warning("No films found at ", url, "; page skipped", call. = FALSE)
    next
  }

  # Film title (the first title span is assumed to be the Chinese title)
  moviename <- capture_first("<span class=\"title\">([^<]*)</span>", items)

  # Director and cast: the text line starting at "导演:" up to the next tag
  director <- capture_first("(导演:[^\n<]*)", items)
  director <- trimws(gsub("&nbsp;", " ", director, fixed = TRUE))

  # The line after the director line holds "year / country / genre",
  # separated by "&nbsp;/&nbsp;". Spaces are removed as in the old version.
  age0 <- capture_first("导演:[^\n<]*<br\\s*/?>\\s*([^\n<]*)", items)
  age0 <- gsub(" ", "", age0, fixed = TRUE)
  info_parts <- strsplit(age0, "&nbsp;/&nbsp;", fixed = TRUE)

  # Year: first four characters of the first field
  age <- substr(nth_part(info_parts, 1L), 1, 4)
  # Country
  country <- nth_part(info_parts, 2L)
  # Genre
  class <- nth_part(info_parts, 3L)

  # Rating
  scoure <- capture_first("average\">([^<]*)</span>", items)

  # Number of ratings: the digits in front of "人"
  number <- capture_first("<span>([^<]*?)人", items)

  # Playable flag: the text inside [...] of the playable span; films without
  # such a span are reported as not playable.
  play <- capture_first("class=\"playable\">\\[([^]]*)\\]", items)
  play[is.na(play)] <- "不可播放"

  # Store
  TOP250 <- data.frame(
    "电影名称" = moviename,
    "导演以及主演" = director,
    "年代" = age,
    "国别" = country,
    "评分" = scoure,
    "评分人数" = number,
    "是否可播放" = play,
    "电影类型" = class
  )

  # First successful page creates the file with a header; later pages append.
  # This avoids the "appending column names" warning and duplicated rows when
  # the script is run again.
  write.table(
    TOP250, out_file,
    append = header_written,
    row.names = FALSE,
    col.names = !header_written,
    sep = ","
  )
  header_written <- TRUE
}