import requests
from lxml import html
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import os
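#for each Nature subject page, collect the dois of the listed articles, then drive Chrome through sci-hub to save the pdfs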
path = 'C:\\Users\\taozz\\OneDrive\\selenium' #folder where the doi lists are stored
main_url = 'http://www.nature.com' #the source of the article links
scihub = 'http://sci-hub.tw/' #the pdfs are downloaded from sci-hub
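#note: the sci-hub domain changes from time to time, so swap in a working mirror if this one stops resolving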
subjects = [] #add your favorite subjects here
subjects.append("https://www.nature.com/subjects/learning-and-memory")
subjects.append('https://www.nature.com/subjects/spatial-memory')
subjects.append('https://www.nature.com/subjects/consolidation')
subjects.append('https://www.nature.com/subjects/operant-learning')
subjects.append('https://www.nature.com/subjects/forgetting')
subjects.append('https://www.nature.com/subjects/attention#research-and-reviews')
option = webdriver.ChromeOptions()
#option.add_argument("headless") #run the scraping in the background
driver = webdriver.Chrome(options = option) #launch Chrome
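#requests + lxml are enough for the static subject pages; Selenium is only needed to drive the sci-hub download step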
for subject in subjects:
    resp = requests.get(subject) #fetch the subject page
    root = html.fromstring(resp.content) #parse the html content with lxml
    #get the hrefs of the articles (not used below, but handy to keep)
    links = [main_url + link for link in root.xpath('//h3[@class="mb10 extra-tight-line-height"]/a/@href')]
    #get the dois of the articles
    dois = root.xpath('//h3[@class="mb10 extra-tight-line-height"]/a/@data-track-dest')
    title = os.path.join(path, subject.split('/')[-1] + '_doi.txt') #one doi list per subject
    if not os.path.exists(title):
        open(title, 'a+').close() #create the list on the first run
    with open(title, 'r+') as f:
        lines = f.readlines()
        new_dois = []
        for doi in dois:
            if doi + '\n' not in lines: #only keep dois we have not recorded before
                new_dois.append(doi) #queue the new doi for download
                f.write(str(doi) + '\n') #and record it so we do not search it again
    ind = 0
    fail_count = 0
    while ind < len(new_dois):
        try:
            driver.get(scihub)
            elem = WebDriverWait(driver, 1).until(
                EC.presence_of_element_located((By.NAME, "request"))
            ) #wait for the search box to appear
            doi = new_dois[ind]
            elem.send_keys(doi) #type in the doi
            elem.send_keys(Keys.RETURN) #open it!
            elem = driver.find_element(By.LINK_TEXT, '⇣ save') #the save button has no id, so find it by its text
            ActionChains(driver).click(elem).perform() #click the save button
            fail_count = 0
        except Exception:
            fail_count += 1 #sometimes sci-hub fails on specific dois
            if fail_count < 6: #retry at most five times so we never loop forever
                ind -= 1 #step back so the same doi is tried again
        finally:
            ind += 1
driver.quit() #shut down the browser once every subject is done
Glancing at the folder while eating takeout and watching videos, I noticed the downloaded papers had already piled up into a mountain, and that is with the crawler still inching along slowly. You can imagine that if its performance were optimized and it turned into a nimble little fly, there would probably never come a day when I finish reading all of these papers...