RSelenium包抓取网易云音乐歌词(iframe框架的处理)

思路

整体流程(以网易云音乐《中国嘻哈榜》为例):Step1:遍历《中国嘻哈榜》页面,获得每期节目的链接 programlink → Step2:遍历所有期数链接,获得每期节目中的歌曲链接 musiclink → Step3:遍历每首歌曲链接,获得歌词 lyric。

页面准备

library(RSelenium)
library(rvest)
# Launch the standalone Selenium server as a background process; the script
# does not wait for the command to return (wait = FALSE).
shell(cmd = "java -jar D:/R/library/Rwebdriver/selenium-server-standalone-3.7.1.jar",
      wait = FALSE,
      invisible = FALSE)
# Driver handle configured for Chrome; the session itself is opened later,
# inside ProgramlinkFunc().
remDr <- remoteDriver(browserName = "chrome")
# Site root; the scraping functions prepend it to the hrefs they collect.
base <- "http://music.163.com"
# Page the crawl starts from.
url <- "http://music.163.com/#/djradio?id=169"

获取节目链接

# Step 1: collect the link of every episode listed on the programme page.
#
# Opens a browser session, loads `url`, switches into the first <iframe> and
# walks the pager, scraping the episode links shown on each page.
#
# Args:
#   remDr: RSelenium remote driver object (its session is opened here).
#   url:   address of the programme page to start from.
#
# Returns:
#   A data.frame with one character column, `programlink`, holding one row
#   per episode link found (zero rows when no link was found).
#
# Note: reads the global `base` to turn the scraped hrefs into absolute URLs.
ProgramlinkFunc <- function(remDr, url) {
  remDr$open()
  remDr$navigate(url)
  webElem <- remDr$findElements("css", "iframe")
  if (length(webElem) == 0) {
    stop("No <iframe> found on ", url, call. = FALSE)
  }
  remDr$switchToFrame(webElem[[1]])
  # Collect one data.frame per page and bind once at the end instead of
  # growing the result with rbind() on every iteration.
  pages <- list()
  j <- 0
  while (TRUE) {
    j <- j + 1
    destination <- remDr$getPageSource()[[1]] %>% read_html()
    # Text of the second-to-last pager link, compared below against the
    # currently selected pager entry to detect the last page.
    totalpage   <- destination %>% html_nodes(".u-page a:nth-last-child(2)") %>%
                   html_text()
    curpage     <- destination %>% html_nodes(".u-page .js-selected") %>%
                   html_text()
    href        <- destination %>% html_nodes(".col2 .tt a") %>%
                   html_attr("href")
    # paste0() treats a zero-length vector as "", which would add a bogus
    # row containing only `base` when a page has no links.
    programlink <- if (length(href) > 0) paste0(base, href) else character(0)
    pages[[j]]  <- data.frame(programlink, stringsAsFactors = FALSE)
    cat(sprintf("第【%d】页节目期数链接抓取成功", j), sep = "\n")
    # A missing pager (e.g. a single-page list) is treated as the last page
    # rather than failing with "argument is of length zero".
    is_last <- length(curpage) == 0 || length(totalpage) == 0 ||
      curpage[1] == totalpage[1]
    if (is_last) {
      break
    }
    # Click the pager link that follows the selected entry (next page).
    remDr$executeScript("arguments[0].click();",
                        list(remDr$findElement("css", ".u-page .js-selected+a")))
  }
  cat("All work is done!", sep = "\n")
  do.call(rbind, pages)
}
# Run step 1 and flatten the one-column data.frame into a character vector.
# The author's run produced 229 values (links to 229 episodes).
programlink <- unlist(ProgramlinkFunc(remDr, url))

获取歌曲链接

# Step 2: collect the song links of every episode.
#
# Visits each episode page, switches into its first <iframe> and scrapes the
# episode label together with the links of the songs listed on it.
#
# Args:
#   link: character vector of episode URLs.
#
# Returns:
#   A data.frame with character columns `num` (episode label) and
#   `musiclink` (absolute song URL), one row per song; an empty data.frame
#   when `link` is empty.
#
# Note: uses the globals `remDr` (an open RSelenium session) and `base`.
MusiclinkFunc <- function(link) {
  if (length(link) == 0) {
    cat("All work is done!", sep = "\n")
    return(data.frame())
  }
  # One data.frame per episode, bound once at the end (no rbind() in the loop).
  pages <- vector("list", length(link))
  for (i in seq_along(link)) {
    remDr$navigate(link[i])
    webElem     <- remDr$findElements("css", "iframe")
    remDr$switchToFrame(webElem[[1]])
    destination <- remDr$getPageSource()[[1]] %>% read_html()
    num         <- destination %>% html_nodes(".tit .f-ff2") %>% html_text()
    href        <- destination %>% html_nodes("span.txt a") %>%
                   html_attr("href")
    # A missing label must not abort the crawl: data.frame() would fail on a
    # zero-length `num` next to a non-empty link vector.
    if (length(num) == 0) {
      num <- NA_character_
    }
    if (length(href) == 0) {
      # No songs on this page: contribute zero rows. paste0(base, character(0))
      # would otherwise yield a bogus row containing only `base`.
      data <- data.frame(num = character(0), musiclink = character(0),
                         stringsAsFactors = FALSE)
    } else {
      data <- data.frame(num, musiclink = paste0(base, href),
                         stringsAsFactors = FALSE)
    }
    pages[[i]] <- data
    cat(sprintf("%s歌曲链接抓取成功", num), sep = "\n")
  }
  cat("All work is done!", sep = "\n")
  do.call(rbind, pages)
}
# Run step 2. `musicinfo` is a data.frame holding the episode label (num) and
# the song link (musiclink); the author's run produced 2063 rows.
musicinfo   <- MusiclinkFunc(programlink)
# Pull out the song links and keep each one once: songs recur across
# episodes, and unique() left 417 links in the author's run.
musiclink   <- unique(musicinfo$musiclink)

获取歌词

# Step 3: fetch the lyric of one song.
#
# Loads the song page, switches into its first <iframe>, clicks the element
# `a#flag_ctrl` (the "展开" / expand button on the song page) and reads the
# song title and lyric text from the resulting page source.
#
# Args:
#   musiclink: URL of a single song page.
#
# Returns:
#   A data.frame with columns `musiclink`, `songname` and `lyric`.
#
# Note: uses the global `remDr`. Any failure along the way (for example the
# expand button not being found) is raised as an error to the caller.
LyricFunc <- function(musiclink) {
  remDr$navigate(musiclink)
  frames <- remDr$findElements("css", "iframe")
  remDr$switchToFrame(frames[[1]])
  # Trigger the click through JavaScript on the located element.
  expand_btn <- remDr$findElement("css", "a#flag_ctrl")
  remDr$executeScript("arguments[0].click();", list(expand_btn))
  page     <- read_html(remDr$getPageSource()[[1]])
  songname <- html_text(html_nodes(page, ".tit em"))
  lyric    <- html_text(html_nodes(page, "#lyric-content"))
  data.frame(musiclink, songname, lyric, stringsAsFactors = FALSE)
}

异常处理

# Step 4: run LyricFunc on every song link, catching errors with tryCatch and
# retrying each link up to 3 times (with a 2-second pause after a failure).
lyricinfo <- list()
for (i in seq_along(musiclink)) {
  # Skip links that already have a stored result.
  if (musiclink[i] %in% names(lyricinfo)) {
    next
  }
  cat(paste("Doing", i, musiclink[i], "..."))
  ok <- FALSE
  counter <- 0
  # Scalar condition, so use the short-circuit operator `&&`.
  while (!ok && counter < 3) {
    counter <- counter + 1
    output <- tryCatch(
      LyricFunc(musiclink[i]),
      error = function(e) {
        Sys.sleep(2)
        # Return the condition object so the failure is recorded, not lost.
        e
      }
    )
    if (inherits(output, "error")) {
      cat("NA...")
    } else {
      ok <- TRUE
      cat("Done.")
    }
  }
  cat("\n")
  # Store the last attempt: a data.frame on success, the error otherwise.
  lyricinfo[[i]] <- output
  names(lyricinfo)[i] <- musiclink[i]
}
remDr$close()
cat("All work is done!", sep = "\n")
# In the author's run `lyricinfo` (a list) held 417 entries. It mixes the
# error objects returned for pages without lyrics with the wanted data, so
# the two still have to be separated.

数据分离和提取

# Separate the successful results from the stored errors.
# A successful LyricFunc() call yields a one-row data.frame with the three
# fields musiclink, songname and lyric; failed links hold a condition object.
# The filter is computed on the entries themselves (the earlier version
# indexed with is.null() on the unfiltered list, which never removed anything).
is_hit <- vapply(
  lyricinfo,
  function(x) is.data.frame(x) && nrow(x) == 1L,
  logical(1)
)
# Bind the kept rows into one data.frame. In the author's run 397 rows
# remained, i.e. 397 usable songs after dropping duplicates and songs
# without lyrics.
lyric <- do.call(rbind, lyricinfo[is_hit])
# Inspect the data, then export it to a local file.
View(lyric)
# qmethod = "double" writes embedded quotes as "" (the CSV convention) instead
# of backslash-escaping them, so lyrics containing quotes can be read back.
write.table(lyric, file = "lyric.csv", row.names = FALSE, sep = ",",
            qmethod = "double")

查看数据

RSelenium包抓取网易云音乐歌词(iframe框架的处理)_第1张图片
RSelenium包抓取网易云音乐歌词(iframe框架的处理)_第2张图片


总结

  • 网易云音乐界面的源代码几乎只有一个