今天分别用selenium和casperjs两种方式，对https://class.coursera.org/nlp/lecture 页面上ppt、pdf、srt、MP4的下载地址进行了数据抓取。
1、python+selenium
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the lecture titles and resource links of the Coursera NLP lecture page.

The page is loaded with PhantomJS through selenium, and the resulting HTML is
parsed with BeautifulSoup.
"""
import sys
import time

from bs4 import BeautifulSoup
from selenium import webdriver

# Python 2 only: make utf-8 the default codec so printing non-ASCII titles does
# not raise UnicodeEncodeError. Python 3 has neither reload() as a builtin nor
# sys.setdefaultencoding, and does not need the workaround.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Raw string so the backslashes in the Windows path are never read as escapes.
PHANTOMJS_PATH = r'C:\phantomjs-1.9.7-windows\phantomjs.exe'
LECTURE_URL = "https://class.coursera.org/nlp/lecture"


def catchDate(s):
    """Extract and print lecture data from the page HTML.

    For every ``ul.course-item-list-section-list`` element this prints the text
    of the ``h3`` found in the element's previous sibling, then, for each
    ``li.unviewed`` inside it, the text of its first link followed by the
    ``href`` of every link in its ``div.course-lecture-item-resource``.

    :param s: HTML source of the lecture page.
    :returns: a list with one entry per section that has at least one
        ``li.unviewed`` item; each entry is the list of those ``li`` elements.
    """
    soup = BeautifulSoup(s)
    z = []
    for obj in soup.findAll("ul", class_="course-item-list-section-list"):
        try:
            print(obj.previous_sibling.find('h3').get_text())
            tmp = obj.findAll('li', class_="unviewed")
            for eachli in tmp:
                titleli = eachli.find('a').get_text()
                print(' ' + titleli)
                allaInEachDiv = eachli.find(
                    'div', class_="course-lecture-item-resource").findAll('a')
                for eacha in allaInEachDiv:
                    print(' ' + eacha['href'])
        except Exception:
            # Best effort: a section whose markup does not match the expected
            # structure is skipped instead of aborting the whole run.
            continue
        # findAll returns a list, so the former comparison against "" was always
        # true; test for emptiness so sections without items are left out.
        if tmp:
            z.append(tmp)
    return z


starttime = time.time()
driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH)
try:
    driver.get(LECTURE_URL)
    html = driver.page_source
    content = catchDate(html)
    endtime = time.time()
    print(endtime - starttime)
finally:
    # quit must be *called*; the bare attribute access `driver.quit` did nothing
    # and left the PhantomJS process running.
    driver.quit()
// CasperJS script: collects the section titles, lecture titles and resource
// links from the Coursera NLP lecture page and writes them to content.txt.
//
// Kept in ES5 syntax (var, function declarations): CasperJS scripts execute on
// PhantomJS, whose JavaScript engine does not support ES2015 features.
var casper = require("casper").create({
    clientScripts: ["jquery-1.7.js"],
    stepTimeout: 120 * 1000,
    pageSettings: { loadImages: false },
    verbose: true,
    logLevel: "error"
});
var fs = require('fs');

var filename = 'content.txt';
var numberOfLinks = 0;
var fullContent = "";
var startTime = new Date();
var endTime;

/**
 * Step: echo the time at which the script started.
 */
function getStartTime() {
    this.echo(startTime);
}

/**
 * Step: build the text report inside the page and store it in fullContent.
 *
 * The function passed to evaluate() is executed in the page context, so it
 * only uses names available there (jQuery) and returns a plain string.
 */
function getcontent() {
    fullContent = this.evaluate(function() {
        var content = "";
        jQuery('.course-item-list-section-list').each(function() {
            // Section title: the h3 inside the element preceding the list.
            var btitle = jQuery(this).prev().find("h3").text();
            content += btitle + '\r\n';
            jQuery(this).find("li").each(function() {
                // Lecture title: text of the first link of the item.
                var stitle = jQuery(this).find("a").first().text();
                content += stitle + '\r';
                jQuery(this).find("div a").each(function() {
                    content += jQuery(this).attr("href") + '\r';
                });
                content += '\r\n';
            });
            content += '\r\n\r\n';
        });
        return content;
    });
}

/**
 * Step: write the collected report to `filename`, replacing any previous file.
 */
function writefile() {
    this.echo('writing to ' + filename);
    fs.write(filename, fullContent, 'w');
}

/**
 * Step: record the time at which scraping and writing finished.
 */
function getEndTime() {
    endTime = new Date();
}

casper.start("https://class.coursera.org/nlp/lecture", function() {
    numberOfLinks = this.evaluate(function() {
        return __utils__.findAll('.course-item-list-section-list').length;
    });
    this.echo(numberOfLinks + " items found");
});

// Steps run in registration order, which is the same order the original
// nested this.then() chain produced.
casper.then(getStartTime);
casper.then(getcontent);
casper.then(writefile);
casper.then(getEndTime);
casper.then(function exitSystem() {
    // Elapsed time in milliseconds, based on the end time recorded above.
    this.echo(endTime - startTime);
    casper.exit();
});

casper.run();
因为还不熟练，感觉写得不太好，求大神对方法进行指导！
参考:
https://gist.github.com/imjared/5201405
http://casperjs.readthedocs.org/en/latest/modules/casper.html#evaluate
http://blog.csdn.net/u012577500/article/details/18185399
http://stackoverflow.com/questions/14894311/casperjs-windows-installation-how-is-it-done-the-correct-way-please
http://blog.csdn.net/sagomilk/article/details/20800543