# Scrape 1688 merchant information (爬取1688商家信息)

# -*- coding: utf-8 -*-
"""
Created on Mon May 20 10:46:27 2019

@author: Administrator
"""

#! /usr/bin/env python
#coding:utf-8

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time
import urllib
import urllib.request
import sys
import os
import re
import csv
import numpy as np

# 解决中文报错的问题

# Output CSV for the scraped merchant records.
# newline='' is required when handing a file to csv.writer: without it the
# csv module emits an extra blank row between records on Windows.
csvfile = open("C:\\Users\\Administrator\\data1.csv", 'w+', newline='')
writer = csv.writer(csvfile)
# Header row: company name, homepage, product, contact person, phone,
# address, region, business scope.
writer.writerow(('a企业名称', 'b主页', 'c产品', 'd联系人', 'e电话', 'f地址', '地区', '经营范围'))
# Earlier search URLs kept for reference (different keywords/pages, GBK-escaped).
#url = 'https://s.1688.com/company/company_search.htm?keywords=%BB%AF%B9%A4&n=y&spm=a260k.635.1998096057.d1'
#url = 'https://s.1688.com/company/company_search.htm?n=y&pageSize=30&keywords=%BB%AF%B9%A4&offset=3&beginPage=20'
#url = 'https://s.1688.com/company/company_search.htm?keywords=%CA%B1%D7%B0&button_click=top&earseDirect=false&n=y&sortType=pop&pageSize=30&offset=3&beginPage=80'
#url = 'https://s.1688.com/company/company_search.htm?keywords=%BC%D2%BE%D3&button_click=top&earseDirect=false&n=y'
#url = 'https://s.1688.com/company/company_search.htm?n=y&pageSize=30&keywords=%B5%E7%C6%F8&offset=3&beginPage=55'
# Company-search results page to scrape (keyword is GBK-escaped, e.g. %B3%E8%CE%EF = "宠物"/pets).
url = 'https://s.1688.com/company/company_search.htm?keywords=%B3%E8%CE%EF&button_click=top&earseDirect=false&n=y&netType=1%2C11'
# Taobao/1688 SSO login page; the long redirect chain lands back on 1688 after sign-in.
loginUrl = 'https://login.taobao.com/member/login.jhtml?style=b2b&from=b2b&newMini=true&full_redirect=true&redirect_url=https%3A%2F%2Flogin.1688.com%2Fmember%2Fjump.htm%3Ftarget%3Dhttps%253A%252F%252Flogin.1688.com%252Fmember%252FmarketSigninJump.htm%253FDone%253Dhttp%25253A%25252F%25252Fmember.1688.com%25252Fmember%25252Foperations%25252Fmember_operations_jump_engine.htm%25253Ftracelog%25253Dlogin%252526operSceneId%25253Dafter_pass_from_taobao_new%252526defaultTarget%25253Dhttp%2525253A%2525252F%2525252Fwork.1688.com%2525252F%2525253Ftracelog%2525253Dlogin_target_is_blank_1688®=http%3A%2F%2Fmember.1688.com%2Fmember%2Fjoin%2Fenterprise_join.htm%3Flead%3Dhttp%253A%252F%252Fmember.1688.com%252Fmember%252Foperations%252Fmember_operations_jump_engine.htm%253Ftracelog%253Dlogin%2526operSceneId%253Dafter_pass_from_taobao_new%2526defaultTarget%253Dhttp%25253A%25252F%25252Fwork.1688.com%25252F%25253Ftracelog%25253Dlogin_target_is_blank_1688%26leadUrl%3Dhttp%253A%252F%252Fmember.1688.com%252Fmember%252Foperations%252Fmember_operations_jump_engine.htm%253Ftracelog%253Dlogin%2526operSceneId%253Dafter_pass_from_taobao_new%2526defaultTarget%253Dhttp%25253A%25252F%25252Fwork.1688.com%25252F%25253Ftracelog%25253Dlogin_target_is_blank_1688%26tracelog%3Dmember_signout_signin_s_reg'
# Drive a real Firefox session so the login survives 1688's bot checks.
driver = webdriver.Firefox()
time.sleep(3)
driver.get(loginUrl)
time.sleep(5)
# Fill in credentials and submit by pressing Enter in the password field.
# (Placeholders '账号'/'密码' = account/password; replace before running.)
# NOTE(review): find_element_by_name was removed in Selenium 4 — this code
# assumes a Selenium 3.x installation.
driver.find_element_by_name("TPL_username").send_keys('账号')
driver.find_element_by_name("TPL_password").send_keys('密码')
driver.find_element_by_name("TPL_password").send_keys(Keys.ENTER)
time.sleep(5)
# Open the search-results page; long sleep lets the JS-rendered list load.
driver.get(url)
time.sleep(15)
# Request headers reused for the direct urllib fetches of each company's
# contact-info page; the cookie string is a captured logged-in session.
user_agent ='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
referer='https://s.1688.com/company/company_search.htm?keywords=%B3%E8%CE%EF&netType=1%2C11&earseDirect=false&button_click=top&n=y&pageSize=30&offset=3&beginPage=2'
cookie='cna=Y5rxE+PKc1oCAXrrzxugRBX9; __last_loginid__=wuqi356; lid=wuqi356; last_mid=b2b-181889983; ali_apache_track=c_mid=b2b-181889983|c_lid=wuqi356|c_ms=1; _cn_slid_=fhUkekh54F; ali_beacon_id=115.198.246.136.155382792318.290921.6; hng=CN%7Czh-CN%7CCNY%7C156; ali_ab=115.192.175.104.1557106627179.0; unb=181889983; __wapcsf__=1; cookie2=53c6bc8fc0e2aaa9bab424e391439235; t=3bc0828126b6ffa0710eaf546c2279f5; _tb_token_=7e33ee36e75e6; __cn_logon__=false; _csrf_token=1558313884528; h_keys="%u5ba0%u7269#%u4e94%u91d1#%u8bf7%u5e16"; ad_prefer="2019/05/20 10:42:10"; alicnweb=touch_tb_at%3D1558331385489%7Clastlogonid%3Dwuqi356%7ChomeIdttS%3D01469649499088246736339992082871925097%7ChomeIdttSAction%3Dtrue; l=bBgewTxevH6LWlG6BOfwZuI8zM_OoIRb8sPzw4Mg6ICPOv5M5yzAWZtV4OLHC3GVa6fkJ3J4RKM8B0Tn-y4EC; isg=BBISwTZDmk88guEVCSe4feVLY9g0ixA254i04txrvkWw77PpxbI8zEFBXwv2n45V'

# Main scraping loop: walk up to 99 result pages. For each listed company,
# fetch its contact-info page with urllib (reusing the logged-in cookie),
# regex-extract contact fields, and append one CSV row.
#
# NOTE(review): this block was recovered from a whitespace-mangled paste.
# The HTML anchor text inside every re.compile('…') literal was stripped
# when the snippet was extracted from its source page, leaving only the
# '(.*?)' capture groups. As written, these patterns match empty strings —
# TODO: restore the original tag delimiters around each capture group
# (from the contactinfo.htm page source) before running.
for page in range(1, 100):
    try:
        # Company name links and first-product links on the current page.
        title = driver.find_elements_by_css_selector(".list-item-title-text")
        product = driver.find_elements_by_xpath('//div[@class="list-item-detail"]/div[1]/div[1]/a[1]')
        pattern = re.compile('(.*?)', re.S)            # contact-info section (anchors lost)
        telPattern = re.compile('(.*?)', re.S)         # 电话 phone (anchors lost)
        membernamePattern = re.compile('(.*?)', re.S)  # contact person (anchors lost)
        addressPattern = re.compile('"address">(.*?)', re.S)  # address (closing anchor lost)
        campanyPattern = re.compile('(.*?)', re.S)     # 地区 region (anchors lost)
        shopPattern = re.compile('(.*?)', re.S)        # business scope (anchors lost)
        for i in range(len(title)):
            titleValue = title[i].get_attribute('title')
            # Contact page lives under the company homepage URL.
            hrefValue = title[i].get_attribute('href') + 'page/contactinfo.htm'
            productValue = product[i].text
            headers = {'User-Agent': user_agent, 'Accept': '*/*',
                       'Referer': referer, 'Cookie': cookie}
            request = urllib.request.Request(hrefValue, headers=headers)
            response = urllib.request.urlopen(request)
            # 1688 serves these pages GBK-encoded.
            html = response.read().decode('GBK')
            # Narrow to the contact-info section; skip companies without one.
            info = re.findall(pattern, html)
            try:
                info = info[0]
            except Exception:
                continue
            tel = re.findall(telPattern, info)
            try:
                tel = tel[0]
                tel = tel.strip()
                tel = tel.replace(' ', '-')
                print(tel)
            except Exception:
                continue
            membername = re.findall(membernamePattern, html)
            try:
                membername = membername[0]
                membername = membername.strip()
            except Exception:
                continue
            # Address is optional — fall back to empty rather than skipping.
            address = re.findall(addressPattern, html)
            try:
                address = address[0]
                address = address.strip()
            except Exception:
                address = ''
            campany = re.findall(campanyPattern, html)
            try:
                campany = campany[-1]
                campany = campany.strip()
            except Exception:
                continue
            shop = re.findall(shopPattern, html)
            try:
                shop = shop[0]
                shop = shop.strip()
            except Exception:
                continue
            print('tel:' + tel)
            print('membername:' + membername)
            print('shop:' + shop)
            print('campany:' + campany)
            data = (titleValue, title[i].get_attribute('href'), productValue,
                    membername, tel, address, campany, shop)
            writer.writerow(data)
        # Debug dump of everything found on this page.
        for t in title:
            print(t.get_attribute('title'))
            print(t.get_attribute('href') + 'page/contactinfo.htm')
        print(len(product))
        for p in product:
            a = repr(p.text)
            a = a.encode('gbk', 'ignore')
            print(a)
        print('#' * 50)
        # Scroll to the bottom so the "next page" button is rendered,
        # then click it and wait for the next results page to load.
        js = 'var q=document.documentElement.scrollTop=30000'
        driver.execute_script(js)
        time.sleep(2)
        page = driver.find_elements_by_css_selector("a[class=page-next]")
        page = page[0]
        page.click()
        time.sleep(20)
    except Exception:
        # Best-effort scraper: log and move on to the next page attempt.
        print('error')
        continue
csvfile.close()
driver.close()

# End of script: scrape 1688 merchant information (爬取1688商家信息)

# (blog footer residue removed: "你可能感兴趣的:(python)")