# -*- coding: utf-8 -*-
import csv
import os
import pickle
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

option = webdriver.ChromeOptions()
# Keep a visible window so manual steps (captcha, two-factor prompts) can be
# completed during the first login; uncomment the flag below to run headless
# once valid cookies are cached.
# option.add_argument("--headless")
browser = webdriver.Chrome(options=option)
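# An alternative to pickling cookies (a sketch, with a hypothetical profile
# path): point Chrome at a persistent user profile so the LinkedIn login
# survives restarts without any manual cookie handling.
# option.add_argument("--user-data-dir=/tmp/linkedin-profile")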
def getLinkedinCookies():
    # Log in once and cache the LinkedIn session cookies to disk.
    url = 'https://www.linkedin.com/feed/'
    browser.get('https://www.linkedin.com/login/')
    time.sleep(1)
    # Replace the placeholder credentials below with your own account.
    browser.find_element(By.XPATH, '//*[@id="username"]').send_keys('[email protected]')
    browser.find_element(By.XPATH, '//*[@id="password"]').send_keys('123456yp')
    browser.find_element(By.XPATH, '//*[@id="app__container"]/main/div/form/div[3]/button').click()
    # A successful login redirects to the feed page; wait until then
    # (a captcha or verification step may have to be completed by hand).
    while browser.current_url != url:
        print("Please log in to linkedin.com!")
        time.sleep(1)
    tbCookies = browser.get_cookies()
    cookies = {}
    for item in tbCookies:
        cookies[item['name']] = item['value']
    # Keep the browser session alive; the callers reuse it after this returns.
    with open('linkedinCookies.pickle', 'wb') as outputPath:
        pickle.dump(cookies, outputPath)
    return cookies
def readLinkedinCookies():
    # Reuse the cached cookie file if it exists;
    # otherwise log in via getLinkedinCookies().
    if os.path.exists('linkedinCookies.pickle'):
        with open('linkedinCookies.pickle', 'rb') as readPath:
            tbCookies = pickle.load(readPath)
    else:
        tbCookies = getLinkedinCookies()
    return tbCookies
def findTotalPage():
    tbCookies = readLinkedinCookies()
    # Cookies can only be set for the domain currently loaded,
    # so open linkedin.com before injecting them.
    browser.get("https://www.linkedin.com")
    browser.delete_all_cookies()
    for cookie in tbCookies:
        browser.add_cookie({
            "domain": ".linkedin.com",
            "name": cookie,
            "value": tbCookies[cookie],
            "path": '/',
        })
    # Search for the company name (a URL-encoded Chinese keyword).
    browser.get("https://www.linkedin.com/search/results/all/?keywords=%E6%9F%AF%E9%A9%AC%E5%B7%A5%E7%A8%8B&origin=GLOBAL_SEARCH_HEADER")
    time.sleep(1)
    # Scroll to the bottom so the pagination bar is rendered.
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(1)
    # Absolute XPath to the last page number; brittle, and it will break
    # whenever LinkedIn changes its page layout.
    Element = browser.find_element(By.XPATH, '/html/body/div[5]/div[7]/div[4]/div/div[2]/div/div[2]/div/div/div/div/div[1]/artdeco-pagination/ul/li[10]/button/span')
    return Element.text
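# A sturdier variant (a sketch, not part of the original flow): wait for the
# pagination bar with WebDriverWait instead of fixed sleeps, and address the
# last page button relatively. The CSS selector is an assumption about
# LinkedIn's current markup and may need adjusting.
def findTotalPageRobust():
    wait = WebDriverWait(browser, 10)
    last_page = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'artdeco-pagination ul li:last-child button span')))
    return last_page.text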
def getInfo():
    tbCookies = readLinkedinCookies()
    browser.get("https://www.linkedin.com")
    time.sleep(2)
    browser.delete_all_cookies()
    for cookie in tbCookies:
        browser.add_cookie({
            "domain": ".linkedin.com",
            "name": cookie,
            "value": tbCookies[cookie],
            "path": '/',
        })
    browser.get('https://www.linkedin.com/search/results/all/?keywords=%E6%9F%AF%E9%A9%AC%E5%B7%A5%E7%A8%8B&origin=GLOBAL_SEARCH_HEADER')
    time.sleep(0.5)
    # Scroll to the bottom so lazily loaded result cards are rendered.
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)
    ul = browser.find_elements(By.CSS_SELECTOR, '.search-result__wrapper')
    person = []
    for li in ul:
        name = li.find_element(By.CSS_SELECTOR, 'span.actor-name').text
        job = li.find_element(By.CSS_SELECTOR, 'p.subline-level-1').text
        location = li.find_element(By.CSS_SELECTOR, 'p.subline-level-2').text
        # The company snippet is not present on every result card.
        if isElementExist(li, 'p.search-result__snippets'):
            company = li.find_element(By.CSS_SELECTOR, 'p.search-result__snippets').text
        else:
            company = None
        person.append([name, job, location, company])
    # utf-8-sig writes a BOM so Excel opens the Chinese text correctly.
    with open('data.csv', 'w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow(["name", "job", "location", "company"])
        writer.writerows(person)
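# A sketch of driving every result page, assuming LinkedIn's search URL
# accepts a "page" query parameter (an observation about the current site,
# not a stable API); each page would then be scraped the way getInfo() does.
def visitAllPages():
    total = int(findTotalPage())
    for page in range(1, total + 1):
        browser.get('https://www.linkedin.com/search/results/all/'
                    '?keywords=%E6%9F%AF%E9%A9%AC%E5%B7%A5%E7%A8%8B'
                    '&origin=GLOBAL_SEARCH_HEADER&page=%d' % page)
        time.sleep(1)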
def isElementExist(parent, element):
    # True if a child matching the CSS selector exists under parent.
    try:
        parent.find_element(By.CSS_SELECTOR, element)
        return True
    except NoSuchElementException:
        return False
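# An exception-free alternative (a sketch): find_elements returns an empty
# list when nothing matches, so its truthiness answers the same question.
def hasElement(parent, selector):
    return bool(parent.find_elements(By.CSS_SELECTOR, selector))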
if __name__ == "__main__":
    getInfo()
    browser.quit()
# Note: I went through a lot of write-ups online and found that none of them
# could actually scrape this data, so I wrote this crawler myself with
# Selenium and ChromeDriver; it searches by company name and scrapes
# employees' basic information.