Scraping LinkedIn data with Python Selenium

# -*- coding: utf-8 -*-
import os
import pickle
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import csv
# Start a visible browser: the first run needs a manual login step,
# which does not work headless. Once the cookies are cached, headless
# mode can be enabled instead:
#   option = webdriver.ChromeOptions()
#   option.add_argument("--headless")
#   browser = webdriver.Chrome(options=option)
browser = webdriver.Chrome()

def getLinkeninCookies():
    # Log in once and cache the LinkedIn session cookies on disk.
    url = 'https://www.linkedin.com/feed/'
    browser.get('https://www.linkedin.com/login/')
    time.sleep(1)
    browser.find_element(By.XPATH, '//*[@id="username"]').send_keys('[email protected]')
    browser.find_element(By.XPATH, '//*[@id="password"]').send_keys('123456yp')
    browser.find_element(By.XPATH, '//*[@id="app__container"]/main/div/form/div[3]/button').click()

    while True:
        print("Please log in to linkedin.com!")
        time.sleep(1)
        # After a successful login, LinkedIn redirects to the feed page.
        if browser.current_url == url:
            tbCookies = browser.get_cookies()
            # Keep the browser open: the caller continues with this session.
            cookies = {}
            for item in tbCookies:
                cookies[item['name']] = item['value']
            with open('linkeninCookies.pickle', 'wb') as outputPath:
                pickle.dump(cookies, outputPath)
            return cookies
def readLinkeninCookies():
    # Use the cached cookies file if it exists;
    # otherwise log in and create it with getLinkeninCookies().
    if os.path.exists('linkeninCookies.pickle'):
        with open('linkeninCookies.pickle', 'rb') as readPath:
            tbCookies = pickle.load(readPath)
    else:
        tbCookies = getLinkeninCookies()
    return tbCookies
def findTotalPage():
    # Return the text of the last pagination button, i.e. the page count.
    tbCookies = readLinkeninCookies()
    browser.get("https://www.linkedin.com")
    browser.delete_all_cookies()
    for cookie in tbCookies:
        browser.add_cookie({
            "domain": ".linkedin.com",
            "name": cookie,
            "value": tbCookies[cookie],
            "path": '/',
            "expires": None
        })
    browser.get("https://www.linkedin.com/search/results/all/?keywords=%E6%9F%AF%E9%A9%AC%E5%B7%A5%E7%A8%8B&origin=GLOBAL_SEARCH_HEADER")
    time.sleep(1)

    # Scroll to the bottom so the pagination bar is rendered.
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(1)
    # Absolute XPath copied from DevTools; it breaks if LinkedIn changes its layout.
    element = browser.find_element(By.XPATH, '/html/body/div[5]/div[7]/div[4]/div/div[2]/div/div[2]/div/div/div/div/div[1]/artdeco-pagination/ul/li[10]/button/span')
    return element.text
def getInfo():
    tbCookies = readLinkeninCookies()
    browser.get("https://www.linkedin.com")
    time.sleep(2)
    # Swap the fresh session's cookies for the saved login cookies.
    browser.delete_all_cookies()
    for cookie in tbCookies:
        browser.add_cookie({
            "domain": ".linkedin.com",
            "name": cookie,
            "value": tbCookies[cookie],
            "path": '/',
            "expires": None
        })
    browser.get('https://www.linkedin.com/search/results/all/?keywords=%E6%9F%AF%E9%A9%AC%E5%B7%A5%E7%A8%8B&origin=GLOBAL_SEARCH_HEADER')
    time.sleep(0.5)
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)
    results = browser.find_elements(By.CSS_SELECTOR, '.search-result__wrapper')
    person = []
    for li in results:
        name = li.find_element(By.CSS_SELECTOR, 'span.actor-name').text
        job = li.find_element(By.CSS_SELECTOR, 'p.subline-level-1').text
        location = li.find_element(By.CSS_SELECTOR, 'p.subline-level-2').text
        # The company snippet is missing on some result cards, so probe for it first.
        if isElementExist(li, 'p.search-result__snippets'):
            company = li.find_element(By.CSS_SELECTOR, 'p.search-result__snippets').text
        else:
            company = None
        person.append([name, job, location, company])
    with open('data.csv', 'w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow(["name", "job", "location", "company"])
        writer.writerows(person)

def isElementExist(parent, selector):
    # True if `selector` matches an element inside `parent`, else False.
    try:
        parent.find_element(By.CSS_SELECTOR, selector)
        return True
    except NoSuchElementException:
        return False
if __name__ == "__main__":
    getInfo()
    browser.quit()
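The fixed time.sleep calls above are fragile: on a slow connection the results may not have rendered when the script reads them. A minimal sketch of the usual alternative, Selenium's explicit waits; the 10-second timeout, the helper name waitForResults, and the reuse of the .search-result__wrapper selector are my assumptions, not part of the original script:

from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def waitForResults(driver, timeout=10):
    # Hypothetical helper: block until at least one result card is present,
    # instead of guessing a sleep duration. The timeout is an assumed value.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.search-result__wrapper'))
    )

getInfo() could then call waitForResults(browser) right after loading the search URL, in place of the fixed time.sleep(2).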
    

I went through a lot of material online and found that none of it could actually scrape the data, so I wrote a crawler myself with Selenium and ChromeDriver. It searches LinkedIn by company name and scrapes employees' basic information (name, job title, location, and company snippet).
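Note that findTotalPage() returns the page count but getInfo() only reads the first page of results. A sketch of how the two could be combined, assuming the search URL accepts a "&page=N" parameter (it did when this was written, but LinkedIn can change this at any time); getAllPages is a hypothetical helper, not part of the script above:

def getAllPages(keyword_url):
    # Hypothetical helper: visit every result page and collect the cards.
    # Assumes "&page=N" pagination on the search URL.
    total = int(findTotalPage())
    rows = []
    for page in range(1, total + 1):
        browser.get(keyword_url + '&page=%d' % page)
        time.sleep(2)
        browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(2)
        for li in browser.find_elements(By.CSS_SELECTOR, '.search-result__wrapper'):
            rows.append(li.text)  # split into fields as getInfo() does
    return rows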
