昨天爬取的是影评,今天试着爬取猫眼电影 TOP100 榜单。网上这类资源有很多,但自己还是摸索了挺久,所以在这里记录一下。
import re
import requests
import time
def get_data(url, headers):
    """Download a page and return its body as decoded text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP request headers forwarded to ``requests.get``.

    Returns:
        The response body as ``str``, decoded with the encoding that
        ``requests`` detects from the content (``apparent_encoding``).

    Raises:
        requests.HTTPError: If the response status code is not 200.
        requests.RequestException: If the request itself fails, including
            when the timeout below expires.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url, headers=headers, timeout=10)
    if r.status_code != 200:
        # Fail with a descriptive error instead of silently returning None,
        # which the caller would pass straight into the regex search.
        raise requests.HTTPError(
            'unexpected status {} for {}'.format(r.status_code, url),
            response=r,
        )
    # Detect the encoding only for responses that are actually used.
    r.encoding = r.apparent_encoding
    return r.text
def view_data(html):
    """Print each matched title in *html* and append it to the output file.

    A match is the value of a ``title="..."`` attribute that directly
    follows a closing double quote and is directly followed by a ``class``
    attribute. Each value is printed and passed to ``save_data`` with a
    trailing newline.

    Args:
        html: Page source to search.
    """
    # Raw string so that ``\s`` reaches the regex engine unchanged; the
    # capture group yields exactly the text between ``title="`` and the
    # closing quote, so no index arithmetic on the match is needed.
    pattern = re.compile(r'"\stitle="(.*?)"\sclass')
    for title in pattern.findall(html):
        print(title)
        save_data(title + '\n')
def save_data(item):
    """Append *item* to ``maoyan100.csv`` in the current directory.

    Args:
        item: Text to write; the caller supplies any trailing newline.
    """
    # Author's note: without the utf_8_sig encoding the CSV showed garbled
    # text, although saving the same data as .txt did not.
    # The ``with`` statement closes the file, so no explicit close() call
    # is needed.
    with open("maoyan100.csv", mode='a', encoding='utf_8_sig') as m:
        m.write(item)
def get_urls():
    """Return the ten board page URLs to crawl.

    The first page has offset 0, the second 10, and so on up to 90.
    """
    base = 'https://maoyan.com/board/4?offset='
    return [base + str(page * 10) for page in range(10)]
if __name__ == "__main__":
    # Request headers sent with every page fetch; get_data() needs them,
    # so they must be defined before the crawl loop starts.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # Record the start time (strftime uses the current local time when no
    # time tuple is given).
    begin_time = time.strftime('%Y-%m-%d %H:%M:%S')
    urls = get_urls()
    # Opening in write mode empties any output left from a previous run;
    # save_data() appends to this file afterwards.
    with open("maoyan100.csv", 'wt', encoding='utf_8_sig'):
        pass
    print('开始爬取:' + begin_time)
    for url in urls:
        html = get_data(url, headers)
        view_data(html)
        # Pause one second between page requests.
        time.sleep(1)
    # Record the end time.
    end_time = time.strftime('%Y-%m-%d %H:%M:%S')
    print('爬取完毕:' + end_time)
import re
import requests
import time
# User-Agent string sent with every request made by this script.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
headers = {'User-Agent': _USER_AGENT}
def get_data(url, headers):
    """Fetch *url* and return the decoded response text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP request headers forwarded to ``requests.get``.

    Returns:
        The response body as ``str``, decoded using the encoding that
        ``requests`` detects from the content (``apparent_encoding``).

    Raises:
        requests.HTTPError: If the response status code is not 200.
        requests.RequestException: If the request itself fails, including
            when the timeout below expires.
    """
    # Bound the wait so a dead connection cannot block the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        response.encoding = response.apparent_encoding
        return response.text
    # Raise a descriptive error rather than returning None, which the
    # caller would hand directly to the regex search.
    raise requests.HTTPError(
        'unexpected status {} for {}'.format(response.status_code, url),
        response=response,
    )
def view_data(html):
    """Print and save every ``title`` attribute value matched in *html*.

    Only ``title="..."`` attributes that directly follow a closing double
    quote and are directly followed by a ``class`` attribute are matched.
    Each value is printed and handed to ``save_data`` with a trailing
    newline.

    Args:
        html: Page source to search.
    """
    # Raw string keeps ``\s`` intact for the regex engine; the capture
    # group returns the attribute value itself, with no slicing needed.
    title_re = re.compile(r'"\stitle="(.*?)"\sclass')
    for match in title_re.finditer(html):
        title = match.group(1)
        print(title)
        save_data(title + '\n')
def get_urls():
    """Build the list of board page URLs, with offsets 0, 10, ..., 90."""
    prefix = 'https://maoyan.com/board/4?offset='
    return [prefix + str(offset) for offset in range(0, 100, 10)]
def save_data(item):
    """Append *item* to ``maoyan100.csv`` in the current directory.

    Args:
        item: Text to write; the caller supplies any trailing newline.
    """
    # The file is opened with the utf_8_sig codec (UTF-8 with a byte-order
    # mark). The ``with`` statement closes it, so no explicit close() call
    # is needed.
    with open("maoyan100.csv", mode='a', encoding='utf_8_sig') as out:
        out.write(item)
if __name__ == "__main__":
    # strftime formats the current local time when no time tuple is given.
    begin_time = time.strftime('%Y-%m-%d %H:%M:%S')
    urls = get_urls()
    # Opening in write mode empties any output left from a previous run;
    # save_data() appends to this file afterwards. The ``with`` statement
    # closes the file by itself.
    with open("maoyan100.csv", 'wt', encoding='utf_8_sig'):
        pass
    print('开始爬取:' + begin_time)
    for url in urls:
        html = get_data(url, headers)
        view_data(html)
        # Pause one second between page requests.
        time.sleep(1)
    end_time = time.strftime('%Y-%m-%d %H:%M:%S')
    print('爬取完毕:' + end_time)
今天还用昨天爬取的影评做了词云,不过没什么用,就不贴上来了。下次试试把排名、电影名称、评分、导演、演员一起爬下来。