Web Scraping (Part 1): 20 Small Crawler Examples

1. Baidu Homepage

1. Import the requests module

import requests

2. Set the Baidu URL

url = "https://www.baidu.com/"

3. Request method: GET

3.1 Disguise the request by adding headers:

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36','Cookie':'BIDUPSID=F6BBCD59FE2A646812DB8DAE641A0BE5; PSTM=1573713375; BAIDUID=F6BBCD59FE2A6468D0329C1E2F60212F:FG=1; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1452_21098_29568_29221_26350; delPer=0; BD_CK_SAM=1; PSINO=2; H_PS_645EC=50d5uY51q2qJG%2BVlK7rlPmCgY73TcN9qKRz4sPKuBII1GIkIx4QkChitGd4; BDSVRTM=209'}

4. Send the request and get the response

response = requests.get(url=url,headers=headers)

5. Inspect the response content

(1) response.text: returns the body as decoded text

(2) response.content: returns the raw bytes; call .content.decode('utf-8') to decode them yourself
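The distinction matters when the server's declared encoding does not match the page bytes. A small sketch, reusing the response from step 4, that contrasts the two:

print(response.encoding)                        # encoding requests inferred from the response headers
text_auto = response.text                       # bytes decoded with that inferred encoding
text_utf8 = response.content.decode('utf-8')    # bytes decoded explicitly as UTF-8
print(text_auto == text_utf8)                   # may be False if the inferred encoding is not UTF-8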

6. Write the page to a local file

with open("baidu.html",'w',encoding="utf-8") as fp:

         fp.write(response.content.decode("utf-8"))

2. Baidu Tieba

import requests
kw = input("请输入你要访问的贴吧名称:")
url = 'https://tieba.baidu.com/f?ie=utf-8&kw={}&fr=search&pn='.format(kw)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}

for page in range(10):
    full_url = url+str(page*50)

    # Send the request
    response = requests.get(url=full_url,headers=headers).content.decode("utf-8")

    # Save the page
    with open("tieba{}.html".format(page+1),'w',encoding='utf-8') as fb:
        fb.write(response)

=============== Supplementary notes:

# GET request parameters:
params = {
    'ie': 'utf-8',
    'kw': 'python',
    'fr': 'search',
    'red_tag': 'y2156030250'
}
url = 'http://tieba.baidu.com/f?'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}
response = requests.request('get',url=url,params=params,headers=headers).content.decode('utf-8')
print(response)
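requests URL-encodes the params dict and appends it to the URL as the query string, so this is equivalent to the hand-built f?ie=utf-8&kw=... URL used earlier. A quick sketch to confirm what was actually requested:

resp = requests.get(url=url, params=params, headers=headers)
print(resp.url)          # final URL with the encoded query string, e.g. http://tieba.baidu.com/f?ie=utf-8&kw=python&fr=search&red_tag=y2156030250
print(resp.status_code)  # 200 on success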

3. Baidu Translate

import requests
import json

def fanyi(kw):
    # 1. URL
    url = 'https://fanyi.baidu.com/sug'

    # Request method: POST

    # 2. Parameters:
    data = {'kw': kw}

    # 3. Send the request
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}
    response = requests.post(url=url,data=data,headers=headers).content.decode('utf-8')
    response = json.loads(response)
    for i in response['data']:
        word = i["k"]
        translate = i["v"]
        print(word+":"+translate+'\n')
        with open('{}.txt'.format(kw),'a',encoding='utf-8') as fp:
            fp.write(word+":"+translate+'\n')

if __name__ == '__main__':
    while True:
        kw = input("请输入你要翻译的内容======>")
        fanyi(kw)
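As a side note, requests can decode JSON bodies directly, so the json.loads(... .content.decode('utf-8')) pair above can be shortened. A minimal sketch of the same /sug request (same url, data and headers as inside fanyi):

resp = requests.post(url=url, data=data, headers=headers)
result = resp.json()                 # equivalent to json.loads(resp.content.decode('utf-8')) here
for item in result['data']:
    print(item['k'] + ':' + item['v'])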

4. Renren

import requests

# 1. URL
url = 'http://www.renren.com/PLogin.do'

# Request method: POST
# 2. Parameters:
data = {
    'email':'18811176939',
    'password':'123457'
}

# 3. Send the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
response = requests.post(url=url,data=data,headers=headers).content.decode('utf-8')
with open('renren.html','w',encoding='utf-8') as fp:
    fp.write(response)

5. Proxy Settings

import requests
import random
# Build a proxy pool
proxies = [
    {'http':'124.113.217.5:9999','https':''},
    {'http':'183.164.239.177','https':''}
]
# Randomly choose a proxy IP
prox = random.choice(proxies)
print(prox)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}

response = requests.get('https://www.kuaidaili.com/free/inha/',proxies=prox,headers=headers).content.decode('utf-8')
print(response)
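Free proxies die quickly, so it is worth testing a proxy before relying on it. A small sketch (the test URL and timeout are my own choices) that filters the pool down to proxies that still answer:

def check_proxy(proxy, test_url='https://www.baidu.com/', timeout=5):
    # Return True if the proxy can fetch the test URL within the timeout.
    try:
        r = requests.get(test_url, proxies=proxy, headers=headers, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

alive = [p for p in proxies if check_proxy(p)]
print(alive)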

6. Amap (Gaode Maps) Weather

import requests
import json
def weatherlist(url1,url2,headers,proxies):
    response = requests.get(url=url1, headers=headers, proxies=proxies).content.decode('utf-8')
    response = json.loads(response)
    for i in response["data"]["cityByLetter"].values():
        for j in i:
            adcode = j["adcode"]
            name = j["name"]
            full_url = url2+adcode
            response = requests.get(url=full_url, headers=headers, proxies=proxies).content.decode('utf-8')
            response = json.loads(response)
            print(response)
            try:
                if response["data"]["data"]:
                    for weather in response["data"]["data"]:
                        for weather in weather['forecast_data']:
                            weather_name = weather['weather_name']
                            temp_min = weather['min_temp']
                            temp_max = weather['max_temp']
                            with open('weather_list.txt', 'a', encoding='utf-8') as fp:
                                fp.write("城市:"+name+ " 天气: "+weather_name+" 最高气温: "+ temp_max
                                    +" 最低气温: "+temp_min+'\n')
            except:
                print('空')
if __name__ == '__main__':
    url1 = 'https://www.amap.com/service/cityList'
    url2 = 'https://www.amap.com/service/weather?adcode='
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36','Cookie':'BIDUPSID=F6BBCD59FE2A646812DB8DAE641A0BE5; PSTM=1573713375; BAIDUID=F6BBCD59FE2A6468D0329C1E2F60212F:FG=1; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1452_21098_29568_29221_26350; delPer=0; BD_CK_SAM=1; PSINO=2; H_PS_645EC=50d5uY51q2qJG%2BVlK7rlPmCgY73TcN9qKRz4sPKuBII1GIkIx4QkChitGd4; BDSVRTM=209'}
    proxies = {'http':'124.113.217.5:9999','https':''}
    weatherlist(url1,url2,headers,proxies)

7. iCIBA (Kingsoft PowerWord) Translation

import requests
import json

def fanyi(url,headers,proxies,data):
    response = requests.post(url=url,headers=headers,proxies=proxies,data=data).content.decode('utf-8')
    response = json.loads(response)
    print(response)
if __name__ == '__main__':
    url = 'http://fy.iciba.com/ajax.php?a=fy'
    w = input("请输入你要翻译的单词=======>")
    data = {
        'f': 'auto',
        't': 'auto',
        'w': w
    }

    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36','Cookie': 'UM_distinctid=16e9cecbfbb12e-0701242418f796-54133310-100200-16e9cecbfbc135; CNZZDATA1256573702=520451556-1574586989-%7C1574586989; __gads=ID=80bd4d4328d6d249:T=1574590731:S=ALNI_MaVD1f5SOmn3mHzHr4qp3LOGH6REA','a': 'fy'}

    fanyi(url,headers,proxies,data)

8. Renren (Two Login Approaches)

# Method 1 ==========


import requests
url = 'http://www.renren.com/PLogin.do'
data = {
    'email':'**********',
    'password':'*******'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
response = requests.post(url=url,headers=headers,data=data).content.decode("utf-8")

url2 = 'http://www.renren.com/964508169/newsfeed/photo'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
'Cookie': 'anonymid=k39t0b6ygzu4ll; depovince=GW; _r01_=1; jebe_key=a4314260-8a4a-4342-8995-c9e648a7a1b8%7C9a7ca8d077b15761409e1298ab30421f%7C1574406709891%7C1%7C1574406774042; JSESSIONID=abcn0y6W3hiuoaYCyCG6w; ick_login=0e033cf7-03c1-4f2d-87c4-3792223ab81b; first_login_flag=1; ln_uact=18811176939; ln_hurl=http://hdn.xnimg.cn/photos/hdn421/20191125/1040/main_v6a1_0aac000038d2195a.jpg; wp_fold=0; jebecookies=a12f9f8a-7b0f-413a-b5b9-17fc13c3e2e5|||||; _de=E1BF45DCCBECABDD1B5D679B401790AD; p=4eca4158940cc418e6f44861141b88f89; t=f0efd7f6c081ed1e0658bfbb470710c59; societyguester=f0efd7f6c081ed1e0658bfbb470710c59; id=964508169; xnsid=5f89fe84; ver=7.0; loginfrom=null; jebe_key=a4314260-8a4a-4342-8995-c9e648a7a1b8%7Cf096e2c5efb5a6aed844642618d7e763%7C1574650138110%7C1%7C1574650203722'}
response2 = requests.get(url=url2,headers=headers).content.decode('utf-8')
with open('ren.html','w',encoding='utf-8') as fp:
    fp.write(response2)




# Method 2

# (1) Create a session object
sess = requests.session()
# (2) Simulate the login; the session records the client's identity (cookies)
url1 = 'http://www.renren.com/PLogin.do'
data = {
    'email':'18811176939',
    'password':'123457'
}
sess.post(url = url1,data=data)

# (3) Visit the feed page
url2 = 'http://www.renren.com/964508169/newsfeed/photo'
response = sess.get(url= url2).content.decode('utf-8')

with open('renren.html','w',encoding='utf-8') as fp:
    fp.write(response)
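The session keeps the login cookies in memory and sends them automatically on every later request, which is why the second GET needs no Cookie header. A session can also be used as a context manager so its connections are released when you are done; a small variant of the code above:

with requests.Session() as sess:
    sess.post(url=url1, data=data)                       # log in; cookies are stored on the session
    page = sess.get(url=url2).content.decode('utf-8')    # cookies are sent automatically
    with open('renren.html', 'w', encoding='utf-8') as fp:
        fp.write(page)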

9. Douban Movies

import requests
import json

def douban(url,headers,proxies):
    content = requests.get(url=url, headers=headers, proxies=proxies).content.decode('utf-8')
    movie_data = json.loads(content)
    data_list = []
    for movie in movie_data:
        data_dict = {}
        data_dict['title'] = movie.get('title')
        data_dict['regions'] = movie.get('regions')
        data_dict['types'] = movie.get('types')
        data_dict['url'] = movie.get('url')
        data_dict['actors'] = movie.get('actors')
        data_list.append(data_dict)
    json_data = json.dumps(data_list,ensure_ascii=False)
    with open('moviedata.json','w',encoding='utf-8') as fp:
        fp.write(json_data)

if __name__ == '__main__':
    url = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=0&limit=20'
    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
    douban(url,headers,proxies)
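The URL above returns only the first 20 entries (start=0&limit=20). Assuming the endpoint pages the same way when start is increased, a sketch that walks several pages by reusing douban():

base = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start={}&limit=20'
for start in range(0, 100, 20):        # first five pages of 20 entries each (an assumption)
    douban(base.format(start), headers, proxies)

Note that douban() opens moviedata.json in 'w' mode, so as written each page overwrites the previous one.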

10. Maoyan Movies

import requests
import json
import re

def maoyan(url,headers,proxies):
    response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
    pattern = re.compile(r'[\d\D]*?')
    movie_list = pattern.findall(response)
    movielist = []
    for movie in movie_list:
        movie_dict = {}
        # Name
        pattern = re.compile(r'}">(.*?)')
        title = pattern.findall(movie)[0]
        movie_dict["title"] = title
        print(title)
        # Rank
        pattern = re.compile(r'')
        rank = pattern.findall(movie)[0]
        movie_dict["rank"] = rank
        print(rank)
        # Score
        pattern = re.compile(r'(\d\.)(\d)')
        score = pattern.findall(movie)
        score = score[0][0]+score[0][1]
        movie_dict["score"] = score
        print(score)
        # Poster image
        pattern = re.compile(r'

11. Animal World (iltaw.com)

# Requirements:
# 1. Animal site:
# http://www.iltaw.com/animal/all  (Animal World)
# For each animal: pictures, Chinese name, English name, scientific name, introduction, diet,
# reproduction, habits, distribution, physical characteristics, ecological habits, growth and breeding, geographic range.

import requests
import json
import re

# Return the first element if the match list is non-empty, otherwise an empty string
def panduan(obj):
    if obj:
        return obj[0]
    else:
        return ''

def dongwu(url,headers,proxies):
    content = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
    pattern = re.compile(r'[\d\D]*?')
    animal_list = pattern.findall(content)
    animallist = []
    for animal in animal_list:
        animal_dict = {}
        # Detail-page link
        pattern = re.compile('')
        detail = pattern.findall(animal)[0]
        animal_dict["detail"] = detail
        animallist.append(animal_dict)

    # Loop over the list and follow each detail link
    animaldetails = []
    for animal_href in animallist:
        animaldict = {}
        href = animal_href.get("detail")
        response = requests.get(url=href,headers=headers,proxies=proxies).content.decode('utf-8')
        # Picture / Chinese name
        pattern = re.compile('')
        name = pattern.findall(response)
        animaldict["中文名"] = panduan(name)
        # English name
        pattern = re.compile('英文名:(.*?);')
        engname = pattern.findall(response)
        animaldict["英文名"] = panduan(engname)
        # Scientific name
        pattern = re.compile('学名:(.*?)')
        xuename = pattern.findall(response)
        animaldict["学名"] = panduan(xuename)
        # Introduction
        pattern = re.compile('。(.*?)')
        jianjie = pattern.findall(response)
        animaldict["简介"] = panduan(jianjie)
        # Diet
        pattern = re.compile('食性: (.*?)')
        food = pattern.findall(response)
        animaldict["食性"] = panduan(food)
        # Reproduction
        pattern = re.compile('繁殖: (.*?)')
        fanzhi = pattern.findall(response)
        animaldict["繁殖"] = panduan(fanzhi)
        # Habits
        pattern = re.compile('习性: (.*?)')
        xixing = pattern.findall(response)
        animaldict["习性"] = panduan(xixing)
        # Distribution
        pattern = re.compile('分布: (.*?)')
        fenbu = pattern.findall(response)
        animaldict["分布"] = panduan(fenbu)
        # Physical characteristics
        pattern = re.compile('外形特征.*?(.*?)',re.S)
        waixing = pattern.findall(response)
        animaldict["外形特征"] = panduan(waixing)
        # Ecological habits
        pattern = re.compile('生态习性.*?(.*?)',re.S)
        shengtai = pattern.findall(response)
        animaldict["生态习性"] = panduan(shengtai)
        # Growth and breeding
        pattern = re.compile('生长繁殖.*?(.*?)', re.S)
        shengzhang = pattern.findall(response)
        animaldict["生长繁殖"] = panduan(shengzhang)
        # Geographic distribution
        pattern = re.compile('地理分布.*?(.*?)', re.S)
        dili = pattern.findall(response)
        animaldict["地理分布"] = panduan(dili)
        # Store the record
        animaldetails.append(animaldict)

    # Convert to JSON and write to a .json file
    data = json.dumps(animaldetails,ensure_ascii=False)
    with open('animal.json','w',encoding='utf-8') as fp:
        fp.write(data)

if __name__ == '__main__':
    url = 'http://www.iltaw.com/animal/all'
    proxies = {'http': '', 'https': '117.28.96.160:9999'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
        'Cookie': '__cfduid=d45abc9c47821980f926a234bc1d141771574671242; PHPSESSID=1111adcbe2de581fc10d8912a9100db8; Hm_lvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; Hm_lpvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; UM_distinctid=16ea1b94f0f142-0edd2ae99214fd-54133310-100200-16ea1b94f108d2; CNZZDATA1000267376=387552791-1574669922-%7C1574669922'}
    dongwu(url,headers,proxies)

12. Guba (Stock Forum)

    import requests
    import json
    import re
    
    def guba(url,headers,proxies):
        response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
        pattern = re.compile(r'[\w\W]*?')
        list = pattern.findall(response)[0]
        li_pattern = re.compile(r'([\d\D]*?)')
        li_list = li_pattern.findall(list)
        gubalist = []
        for li in li_list:
            dict = {}
            # Read count and comment count
            pattern = re.compile(r'(.*?)',re.S)
            data = pattern.findall(li)
            print(data[0].strip())
            print(data[1].strip())
            dict["阅读量"] = data[0].strip()
            dict["评论数"] = data[1].strip()
            # Bar name
            pattern = re.compile(r'.* class="balink">(.*?)]')
            baname = pattern.findall(li)
            print(baname[0])
            dict["吧名称"] = baname[0]
            # Title
            pattern = re.compile(r'(.*?)')
            author = pattern.findall(li)
            print(author[0])
            dict["作者"] = author[0]
            # Last updated
            pattern = re.compile(r'(.*?)')
            time = pattern.findall(li)
            print(time[0])
            dict["更新时间"] = time[0]
            # Detail URL
            pattern = re.compile(r'

13. Online Pharmacy (111.com.cn)

    
    import requests
    import re
    import json
    
    def yaofang(url,headers,proxies):
        response = requests.get(url=url,headers=headers,proxies=proxies).content.decode( 'gbk')
        # Shop name
        pattern = re.compile(r'(.*?)',re.S)
        name = pattern.findall(response)
        if name:
            dict["店名"] = name[0]
        else:
            dict["店名"] = "自营"
        # detail_list.append(dict)
        # Product details
        pattern = re.compile(r'[\w\W]*?')
        table_list = pattern.findall(response)
        for detail in table_list:
            dict2 = {}
            # print(detail)
            # Product name
            pattern = re.compile(r'商品名称:.*?(.*?)',re.S)
            name = pattern.findall(detail)[0]
            dict["商品名称"] = name
            # Brand
            pattern = re.compile(r'品  牌:.*?(.*?)',re.S)
            pinpai = pattern.findall(detail)[0]
            dict["品牌"] = pinpai
            # Specification
            pattern = re.compile(r'规  格:.*?(.*?)', re.S)
            guige = pattern.findall(detail)[0]
            dict["规格"] = guige
            # Weight
            pattern = re.compile(r'重  量:.*?(.*?)', re.S)
            zhongliang = pattern.findall(detail)[0]
            dict["重量"] = zhongliang
            # Manufacturer
            pattern = re.compile(r'生产厂商:.*?(.*?)', re.S)
            shengchan = pattern.findall(detail)[0]
            dict["生产厂商"] = shengchan
            # # Approval number
            # pattern = re.compile(r'批准文号:.*?(.*?)', re.S)
            # pizhun = pattern.findall(detail)[0].split()
            # # dict2["批准文号"] = pizhun
            # # print(pizhun)
            detail_list.append(dict)
        data = json.dumps(detail_list,ensure_ascii=False)
        with open('yaofan.json','w',encoding='utf-8') as fp:
            fp.write(data)

    if __name__ == '__main__':
        for i in range(1,6):
            url = 'https://www.111.com.cn/categories/953710?tp=10-{}'.format(i)
            proxies = {'http': '117.95.192.4:9999', 'https': ''}
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
                'Cookie': '__cfduid=d45abc9c47821980f926a234bc1d141771574671242; PHPSESSID=1111adcbe2de581fc10d8912a9100db8; Hm_lvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; Hm_lpvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; UM_distinctid=16ea1b94f0f142-0edd2ae99214fd-54133310-100200-16ea1b94f108d2; CNZZDATA1000267376=387552791-1574669922-%7C1574669922'}
            yaofang(url,headers,proxies)
14. Maoyan Movies with XPath

    from lxml import etree
    import requests
    import json
    
    def maoyan(url,headers,proxies):
        response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf8')
        tree = etree.HTML(response)
        dd_list = tree.xpath('//dd')
        list = []
        for dd in dd_list:
            dict = {}
            # Rank
            paiming = dd.xpath('./i/text()')[0]
            dict["排名"] = paiming
            # Name
            name = dd.xpath('./a/@title')[0]
            dict["名字"] = name
            # Actors
            star = dd.xpath('.//p[@class="star"]/text()')[0].strip().replace('主演:','')
            dict["主演"] = star
            # Release date
            time = dd.xpath('.//p[@class="releasetime"]/text()')[0].replace('上映时间:','')
            dict["上映时间"] = time
            # Image link
            img = dd.xpath('.//img[@class="board-img"]/@data-src')[0]
            dict['图片链接'] = img
            # Detail link
            detail = dd.xpath('./a/@href')[0]
            dict["详情链接"] = detail
            list.append(dict)
        data = json.dumps(list,ensure_ascii=False)
        with open('maoyan.json','w',encoding='utf-8') as fp:
            fp.write(data)
    
    
    
    if __name__ == '__main__':
        url = 'https://maoyan.com/board'
        proxies = {'http': '125.110.90.93:9000', 'https': ''}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
        maoyan(url,headers,proxies)

15. Lianjia

    """
    链家网站爬取
    一、案例需求:
    链家:https://bj.fang.lianjia.com/loupan/
    1、获取所有的城市的拼音
    2、根据拼音去拼接url,获取所有的数据。
    3、列表页:图片,楼盘名称,均价,建筑面积,区域,商圈
    详情页:户型(["8室5厅8卫", "4室2厅3卫", "5室2厅2卫"]),朝向,图片(列表),用户点评(选爬)
    
    项目地址:通燕高速耿庄桥北出口中化石油对面
    售楼处地址:通燕高速北侧耿庄桥出口北500米潞苑南大街(接待时间 9:00 - 18:00)
    开发商:石榴置业集团股份有限公司
    物业公司:浙江省绿城物业服务有限公司
    最新开盘:2018.01.20 转成时间戳
    物业类型:别墅
    交房时间:2016年05月10日 转成2016-05-10
    容积率:1.20
    产权年限:70年
    绿化率:30%
    规划户数:173
    物业费用:6.91~7.13元/m2/月(不要后面的:元/m2/月)
    车位情况:地下车位数584(只要数字)
    供暖方式:集中供暖
    供水方式:民水
    供电方式:民电
    建筑类型:板楼
    嫌恶设施:暂无
    占地面积:39,600㎡(不要单位)
    建筑面积:40,000㎡(不要单位)
    
    """
    import pymysql
    class PyMysql:
        def __init__(self):
            ## 1. Connect to the database
            self.db = pymysql.connect(user='root',host='localhost',password='123456',database='lianjia')
            ## 2. Create a cursor
            self.c = self.db.cursor()
        def sql_caozuo(self,sql):
            ## 3. Execute the SQL statement
            self.c.execute(sql)
            ## Print the result set
            print(self.c.fetchall())
        ## The __del__ magic method runs when the object is garbage-collected after the main program finishes.
        def __del__(self):
            self.db.commit()
            self.c.close()
            self.db.close()
    
    import requests
    from lxml import etree
    import json
    import re
    import math
    
    def panduan(obj):
        if obj:
            return obj[0]
        else:
            return 'None'
    
    
    def pinyin(url2,headers,proxies):
        response = requests.get(url=url2,headers=headers,proxies=proxies).text
        response = str(response)
        partten = re.compile(r'"short":"(.*?)"')
        pinyin_list= partten.findall(response)
    
        for pinyin in pinyin_list:
            full_url = 'https://{}.fang.lianjia.com/loupan/'.format(pinyin)
            try:
                response1 = requests.get(url=full_url,headers=headers,proxies=proxies).content.decode('utf-8')
                tree = etree.HTML(response1)
            except:
                continue
            # Extract the total number of pages:
            total_page = tree.xpath('//div[@class="page-box"]/@data-total-count')
            for page in range(1, math.ceil(int(total_page[0]) / 10) + 1):
                url = full_url+'pg{}'.format(page)
                # print(url)
                response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
                tree = etree.HTML(response)
                li_list = tree.xpath('//ul[@class="resblock-list-wrapper"]/li')
                lianjia_list=[]
                for li in li_list:
                    lianjia_dict = {}
                    # (1) Image:
                    src = li.xpath('.//img[@class="lj-lazy"]/@data-original')
                    print(panduan(src))
                    tupian = panduan(src)
                    lianjia_dict["图片链接"] = tupian
                    # (2) Property name
                    title = li.xpath('./a/@title')
                    print(panduan(title))
                    loupan = panduan(title)
                    lianjia_dict["楼盘名称"] = loupan
                    # (3) Average price
                    price = li.xpath('.//span[@class="number"]/text()|.//span[@class="desc"]/text()')
                    price_new = ''.join(price).replace('\xa0', '')
                    print(price_new)
                    lianjia_dict["均价"] = price_new
    
    
                    # (4) Floor area
                    area_list = li.xpath('.//div[@class="resblock-area"]/span/text()')
                    print(panduan(area_list))
                    jianzhu = panduan(area_list)
                    lianjia_dict["建筑面积"] = jianzhu
                    # (5) District
                    add = li.xpath('.//div[@class="resblock-location"]/span/text()')
                    quyu = panduan(add)
                    print(panduan(add))
                    lianjia_dict["区域"] = quyu
                    # (6) Business area
                    shop = li.xpath('.//div[@class="resblock-location"]/a/text()')
                    print(panduan(shop))
                    shangquan = panduan(shop)
                    lianjia_dict["商圈"] = shangquan
    
                    # Detail-page URL:
                    href = li.xpath('.//a[@class="resblock-room"]/@href')[0].replace("/loupan/","")
                    detail_url = full_url+href
                    # print(detail_url)
                    response = requests.get(url=detail_url,headers=headers,proxies=proxies).content.decode('utf-8')
                    detail_tree = etree.HTML(response)
                    # Visit the detail page
                    # Floor plans
                    li_list = detail_tree.xpath('//li[@data-index="0"]/ul')
                    huxinglist = []
                    chaoxianglist = []
                    imglist = []
                    for li in li_list:
    
                        # Floor plan
                        huxing = li.xpath('.//div[@class="content-title"]/text()')
                        huxinglist.append(panduan(huxing).strip())
                        # Orientation
                        chaoxiang = li.xpath('.//div[@class="content-area"]/text()')
                        chaoxianglist.append(panduan(chaoxiang))
                        #img
                        img = li.xpath('.//img/@src')[0]
                        imglist.append(img)
                    print(imglist)
                    lianjia_dict["户型图片"] = imglist
    
                    # Store floor plan and orientation as key-value pairs
                    huchao_list = []
                    for huxing,chaoxiang in zip(huxinglist,chaoxianglist):
                        dict = {}
                        dict[huxing] = chaoxiang
                        huchao_list.append(dict)
                    print(huchao_list)
                    lianjia_dict["户型朝向"] = huchao_list
                    # Property detail information
                    xhref = detail_tree.xpath('.//div[@class="more-building"]/a/@href')[0]
                    href = full_url.replace('/loupan/','')+xhref
                    # print(href)
                    response_xinxi = requests.get(url=href, headers=headers,proxies=proxies).content.decode('utf-8')
                    xinxi_tree = etree.HTML(response_xinxi)
                    xinxi_list = xinxi_tree.xpath('//div[@class="big-left fl"]')
                    for xinxi in xinxi_list:
                        # Sales office address
                        shoulou = xinxi.xpath('.//ul[@class="x-box"][1]/li[@class="all-row"][2]/span[@class="label-val"]/text()')
                        sl = panduan(shoulou)
                        print(sl)
                        lianjia_dict["售楼处地址"] = sl
    
                        # Developer
                        kaifa = xinxi.xpath('.//ul[@class="x-box"][1]/li[@class="all-row"][3]/span[@class="label-val"]/text()')
                        kf = panduan(kaifa)
                        print(kf)
                        lianjia_dict["开发商"] = kf
                        # Property management company
                        wuye = xinxi.xpath('.//ul[@class="x-box"][3]//span[@class="label-val"]/text()')
                        wy = panduan(wuye)
                        print(wy)
                        lianjia_dict["物业公司"] = wy
                        # Latest opening date
                        kaipan = xinxi.xpath('./ul[@class="fenqi-ul"]/li[3]/span[@class="fq-td fq-open"]/span/text()')
                        kp = panduan(kaipan)
                        print(kp)
                        lianjia_dict["最新开盘"] = kp
                        # Property type
                        wuyetype = xinxi.xpath('.//ul[@class="x-box"][2]//span[@class="label-val"]/text()')
                        wuyet = panduan(wuyetype)
                        print(wuyet)
                        lianjia_dict["物业类型"] = wuyet
                        # Handover date
                        jiaofangtime = xinxi.xpath('./ul[@class="fenqi-ul"]/li[@class="fq-nbd"]/span[@class="fq-td fq-open"]/span/text()')
                        jf = panduan(jiaofangtime)
                        print(jf)
                        lianjia_dict["交房时间"] = jf
                        # Plot ratio
                        rongji = xinxi.xpath('./ul[@class="x-box"][2]/li[4]/span[@class="label-val"]/text()')
                        rj = panduan(rongji).strip()
                        print(rj)
                        lianjia_dict["容积率"] = rj
                        # Ownership term
                        chanquan = xinxi.xpath('.//ul[@class="x-box"][2]/li[8]/span[@class="label-val"]/text()')
                        cq = panduan(chanquan).strip()
                        print(cq)
                        lianjia_dict["产权年限"] = cq
                        # Greening rate
                        lvhua = xinxi.xpath('.//ul[@class="x-box"][2]/li[2]/span[@class="label-val"]/text()')
                        lh = panduan(lvhua).strip()
                        print(lh)
                        lianjia_dict["绿化率"] = lh
                        # Planned number of households
                        yonghu = xinxi.xpath('.//ul[@class="x-box"][2]/li[7]/span[@class="label-val"]/text()')
                        yh = panduan(yonghu)
                        print(yh)
                        lianjia_dict["规划户数"] = yh
                        # Property management fee
                        wuyefei = xinxi.xpath('.//ul[@class="x-box"][3]/li[3]/span[@class="label-val"]/text()')
                        wyf = panduan(wuyefei)
                        print(wyf)
                        lianjia_dict["物业费用"] = wyf
                        # Parking
                        chewei = xinxi.xpath('.//ul[@class="x-box"][3]/li[7]/span[@class="label-val"]/text()')
                        cw = panduan(chewei).strip()
                        print(cw)
                        lianjia_dict["车位情况"] = cw
                        # Heating
                        gongnuan = xinxi.xpath('.//ul[@class="x-box"][3]/li[4]/span[@class="label-val"]/text()')
                        gn = panduan(gongnuan)
                        print(gn)
                        lianjia_dict["供暖方式"] = gn
                        # Water supply
                        gongshui = xinxi.xpath('.//ul[@class="x-box"][3]/li[5]/span[@class="label-val"]/text()')
                        gs = panduan(gongshui)
                        print(gs)
                        lianjia_dict["供水方式"] = gs
                        # Electricity supply
                        gongdian = xinxi.xpath('.//ul[@class="x-box"][3]/li[6]/span[@class="label-val"]/text()')
                        gd = panduan(gongdian)
                        print(gd)
                        lianjia_dict["供电方式"] = gd
                        # # Nuisance facilities
                        # Land area
                        mianji = xinxi.xpath('.//ul[@class="x-box"][2]/li[3]/span[@class="label-val"]/text()')
                        mj = panduan(mianji).strip()
                        print(mj)
                        lianjia_dict["占地面积"] = mj
                        # Save to the database
                        p = PyMysql()
                        sql = 'insert into lianjia_data(img,name,price,area,address,shangquan,huxinimg,huxingdata,shoulouadd,kaifa,wuye,kaipan,jianzhutype,jiaofangtime,rongji,chanquan,lvhau,usernum,wuyefei,chewei,gn,gs,gd,jzarea) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(tupian,loupan,price_new,jianzhu,quyu,shangquan,imglist,huchao_list,sl,kf,wy,kp,wuyet,jf,rj,cq,lh,yh,wyf,cw,gn,gs,gd,mj)
                        p.sql_caozuo(sql)
                        # print(lianjia_dict)
                        lianjia_list.append(lianjia_dict)
                    # print(lianjia_list)
                # data =json.dumps(lianjia_list,ensure_ascii=False)
                # with open('lianjia.json','a',encoding='utf-8') as fp:
                #         fp.write(data)
    
    if __name__ == '__main__':
        proxies = {'http': '125.110.90.93:9000', 'https': ''}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
    
        url2 = 'https://ajax.api.lianjia.com/config/cityConfig/getConfig?callback=jQuery1111039566567830421073_1574853147759&type=province&category=1&_=1574853147760'
    
        p = PyMysql()
        ## Drop-table statement
        # p.sql_caozuo('drop tables demo03')
        ## Create-table statement
        p.sql_caozuo(
            'create table lianjia_data(id int primary key auto_increment,img varchar(255),name varchar(255),price varchar(255),area varchar(255),address varchar(255),shangquan varchar(255),huxinimg varchar(5555),huxingdata varchar(5555),shoulouadd varchar(255),kaifa varchar(255),wuye varchar(255),kaipan varchar(255),jianzhutype varchar(255),jiaofangtime varchar(255),rongji varchar(255),chanquan varchar(255),lvhau varchar(255),usernum varchar(255),wuyefei varchar(255),chewei varchar(255),gn varchar(255),gs varchar(255),gd varchar(255),jzarea varchar(255))')
        ## Insert-data statement
    
        pinyin(url2,headers,proxies)
    

16. Taobao

    import requests
    import re
    import json
    from lxml import etree
    
    def panduan(obj):
        if obj:
            return obj
        else:
            return ''
    
    def taobao(url,headers,proxies):
        response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf8')
        pattern = re.compile(r'\((.*)\)',re.S)
        data = pattern.findall(response)[0].strip()
        data = json.loads(data)
        data = data["result"]
        for k,v in data.items():
            dict = {}
            result = v["result"]
            for i in result:
    
                dict["价格"] = i["item_current_price"]
            # print(dict)
            # print(result)
    
    
    if __name__ == '__main__':
        datalist = [
            'https://tce.taobao.com/api/mget.htm?callback=jsonp1579&tce_sid=1870316,1871653&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
            'https://tce.taobao.com/api/mget.htm?callback=jsonp1666&tce_sid=1870321,1871654&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
            'https://tce.taobao.com/api/mget.htm?callback=jsonp1753&tce_sid=1870333,1871655&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
            'https://tce.taobao.com/api/mget.htm?callback=jsonp1840&tce_sid=1870340,1871656&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
            'https://tce.taobao.com/api/mget.htm?callback=jsonp1927&tce_sid=1870341,1871659&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
            'https://tce.taobao.com/api/mget.htm?callback=jsonp2014&tce_sid=1870342,1871657&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
            'https://tce.taobao.com/api/mget.htm?callback=jsonp2101&tce_sid=1870343,1871658&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online']
    
        proxies = {'http': '125.110.90.93:9000', 'https': ''}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
        for url in datalist:
            taobao(url,headers,proxies)
    

17. NetEase Cloud Music

    
    # https://music.163.com/#/discover/artist/
    
    import requests
    import json
    import re
    from lxml import etree
    
    def wangyi(url,headers,proxies):
        response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf8')
        # with open('wangyi.html','w',encoding='utf-8') as fp:
        #     fp.write(response)
        # Get the male/female singer category links
        tree = etree.HTML(response)
        sing_list = tree.xpath('//div[@id="singer-cat-nav"]//a/text()')
        # print(sing_list)
    
        # Get the URLs
        sing_url_list = tree.xpath('//div[@id="singer-cat-nav"]//a/@href')
        # print(sing_url_list)
    
        for singer,singer_url in zip(sing_list,sing_url_list):
            sing_full_url = 'https://music.163.com'+singer_url
            response = requests.get(url=sing_full_url,headers=headers,proxies=proxies).content.decode('utf8')
            tree = etree.HTML(response)
            # Extract the initial-letter links
            letter_href_list = tree.xpath('//ul[@id="initial-selector"]/li[position()>1]/a/@href')
            for letter_url in letter_href_list:
                letter_ful_url = 'https://music.163.com'+letter_url
                response = requests.get(url=letter_ful_url,headers=headers,proxies=proxies).content.decode('utf8')
                tree = etree.HTML(response)
                # Extract the singer names
                name_list = tree.xpath(
                    '//ul[@class="m-cvrlst m-cvrlst-5 f-cb"]/li//a[@class="nm nm-icn f-thide s-fc0"]/text()')
                href_list = tree.xpath(
                    '//ul[@class="m-cvrlst m-cvrlst-5 f-cb"]/li//a[@class="nm nm-icn f-thide s-fc0"]/@href')
                for name, j in zip(name_list, href_list):
                    full_href = 'https://music.163.com' + j.strip()
                    print(name,full_href)
    if __name__ == '__main__':
        proxies = {'http': '125.110.90.93:9000', 'https': ''}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
        url = 'https://music.163.com/discover/artist/cat'
        wangyi(url,headers,proxies)

18. Zhilian Zhaopin

    import requests
    import re
    import json
    from lxml import etree
    import time
    from time import strftime
    
    import pymysql
    import random
    
    class PyMysql:
        def __init__(self):
            ## 1. Connect to the database
            self.db = pymysql.connect(user='root',host='localhost',password='123456',database='lianjia')
            ## 2. Create a cursor
            self.c = self.db.cursor()
        def sql_caozuo(self,sql):
            ## 3. Execute the SQL statement
            self.c.execute(sql)
            ## Print the result set
            print(self.c.fetchall())
        ## The __del__ magic method runs when the object is garbage-collected after the main program finishes.
        def __del__(self):
            self.db.commit()
            self.c.close()
            self.db.close()
    
    
    def zhilian(url,headers,proxies):
        response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf8')
        data = json.loads(response)
        # print(data["data"]["results"])
        data_list = []
        for job in data["data"]["results"]:
            job_dict = {}
            # print(job)
    
            # Job title
            jobname = job["jobName"]
            print(jobname)
            job_dict["岗位名称"] = jobname
            # Company name
            companyname = job["company"]["name"]
            print(companyname)
            job_dict["公司名称"] = companyname
            # Company size
            companynum = job["company"]["size"]["name"]
            print(companynum)
            job_dict["公司人数"] = companynum
            # Company type
            companytype = job["company"]["type"]["name"]
            print(companytype)
            job_dict["公司类型"] = companytype
            # Skill requirements
            positionLabel = job["positionLabel"]
            data = json.loads(positionLabel)
            skillLabel = data["skillLabel"]
            label_list = []
            try:
                if skillLabel[0]:
                    for label in skillLabel:
                        label = label["value"]
                        label_list.append(label)
            except:
                label_list.append("None")
            label = label_list
            job_dict["技能要求"] = label_list
    
            # Detail URL
            detail_url = job["positionURL"]
            print(detail_url)
            response = requests.get(url=detail_url,headers=headers,proxies=proxies).content.decode("utf8")
            # time.sleep(5)
            # print(response)
            tree = etree.HTML(response)
    
            # Job description
            zhize_list = tree.xpath('.//div[@class="describtion"]/div[@class="describtion__detail-content"]/text()')
            print(zhize_list)
            job_dict["职业描述"] = zhize_list
    
            # Work location
            display = job["city"]["display"]
            businessArea = job["businessArea"]
            bs = businessArea
            ds = display
            area = ds + bs
            print(area)
            job_dict["工作地点"] = area
            # Salary
            salary = job["salary"]
            print(salary)
            job_dict["薪资水平"] = salary
            # Qualifications
            # Education
            eduLevel = job["eduLevel"]["name"]
            # Work experience
            workingExp = job["workingExp"]["name"]
            zige = eduLevel + workingExp
            print(zige)
            job_dict["任职资格"] = zige
            # Last updated / published
            updateDate = job["updateDate"]
            fabutime = updateDate
            print(fabutime)
            job_dict["更新发布时间"] = fabutime
            # Crawled by
            crawl_name = "高祎曼"
            print(crawl_name)
            job_dict["爬取人"] = crawl_name
            # Crawl time
            paqutime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())
            print(paqutime)
            job_dict["爬取时间"] = paqutime
            data_list.append(job_dict)
            paquwangzhan = "智联"
            # p = PyMysql()
            # sql = 'insert into zhilian_data(岗位名称,公司名称,公司人数,公司类型,技能要求,职业描述,工作地点,薪资水平,任职资格,更新发布时间,爬取人,爬取时间,爬取网站) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(jobname,companyname,companynum,companytype,label,zhize_list,area,salary,zige,fabutime,crawl_name,paqutime,paquwangzhan)
            # p.sql_caozuo(sql)
    
        # data = json.dumps(data_list,ensure_ascii=False)
        # with open('zhilian.json','w',encoding='utf-8') as fp:
        #     fp.write(data)
    
    
    
    if __name__ == '__main__':
        proxies = {'http': '125.110.90.93:9000', 'https': ''}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
            'cookie':'guid=7575-be2f-7c15-c306; amap_ver=1536672475634; key=6a7665aa7301eae686d9e79884d0445b'}
        url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=all&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=python%E7%88%AC%E8%99%AB%E5%B7%A5%E7%A8%8B%E5%B8%88&kt=3&=0&_v=0.15635057&x-zp-page-request-id=fd5128b31ef84aceb1e84d41097963e1-1575100818571-464999&x-zp-client-id=04d803dc-74e4-4170-cf84-d3cdc5607b84&MmEwMD=4T6BXuvx_UOBZhGW5PAnis7UzYfOFZcUeeZ1Hq9jrdCULnycfzc9C7nw5eIOH2F4HihMYmky5F4y4wzH6vHJoJaLwKt2PX9n9eLVZASrAFRl3WXEqNzECxIt2if_0tHRqzNsYsSgp68KeoERtc2Pj8bQuiUJzPKbJJf5kIr_sReC3iAmhs5AkB.q55sS343U6.eC7.kfI2sb_.vTfBfDzJ6zSB_lgyMFXwMfmYMZMTDoKyG0JCeuMjUTVC1k0gGNhVbeLOCCzRjcSTvAKFfbPxnO.W5mSiOgXGCdZiT5Bxlz1piZ.6kc3N4J6QqObZblvoscyt.v0zGieC_8PYEqsj2ztAZNBcbreqmYxmNhVIQzljDX0LhKnaRdfJ5AowwtwYMUf9eWYMcdlkJkv9ton7pbV'
        # p = PyMysql()
        ## Create-table statement
        # p.sql_caozuo(
        #     'create table lianjiadata(id int primary key auto_increment,)')
        zhilian(url,headers,proxies)

19. Lagou

    
    
    import pymysql
    import random
    
    class PyMysql:
        def __init__(self):
            ## 1. Connect to the database
            self.db = pymysql.connect(user='root',host='localhost',password='123456',database='lianjia')
            ## 2. Create a cursor
            self.c = self.db.cursor()
        def sql_caozuo(self,sql):
            ## 3. Execute the SQL statement
            self.c.execute(sql)
            ## Print the result set
            print(self.c.fetchall())
        ## The __del__ magic method runs when the object is garbage-collected after the main program finishes.
        def __del__(self):
            self.db.commit()
            self.c.close()
            self.db.close()
    
    import requests
    from lxml import etree
    import json
    import re
    import math
    import time
    
    def panduan(obj):
        if obj:
            return obj[0]
        else:
            return 'None'
    
    
    def lagou(url,headers,proxies):
        time.sleep(3)
        response = requests.post(url=url,headers=headers,proxies=proxies).text
        # data_list = json.loads(response)
        print(response)
    
    
    
    
    
    
    if __name__ == '__main__':
        proxies = {'http': '125.110.90.93:9000', 'https': ''}
        usag_list = [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
            'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
        ]
        usag = random.choice(usag_list)
        headers = {
            'User-Agent': usag,
            'cookie':'user_trace_token=20191129155328-84048293-4801-4663-91ae-9695ac9c4693; _ga=GA1.2.1490834596.1575014072; LGUID=20191129155329-54ea0923-127d-11ea-a68e-5254005c3644; _gid=GA1.2.898573975.1575014080; index_location_city=%E5%8C%97%E4%BA%AC; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216eb627d70c176-0beb948187e8ff-32365f08-1049088-16eb627d70d272%22%2C%22%24device_id%22%3A%2216eb627d70c176-0beb948187e8ff-32365f08-1049088-16eb627d70d272%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; JSESSIONID=ABAAABAAADEAAFIA1EE71982D903A109AFEFF779585C4FA; WEBTJ-ID=20191130095146-16eba01bb9d47a-020b149bcc6827-32365f08-1049088-16eba01bb9e36d; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1575014080,1575030607,1575033330,1575078706; LGSID=20191130095042-d17fd70b-1313-11ea-a68e-5254005c3644; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DwzB7mu7hBPcdl6Tni88-qBT6Hm86I74H5shBvm7Ugzi%26wd%3D%26eqid%3D962e999d0007d16a000000025de1caeb; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=search_code; SEARCH_ID=1a48505a36e843569e84500c2600b603; _gat=1; X_HTTP_TOKEN=440fb692ec96ea8037997057518ccbba7f70585ce7; LGRID=20191130101253-ead576cf-1316-11ea-a68e-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1575080037'
    
        }
    
        for i in range(1,8):
            url = 'https://www.shixiseng.com/app/interns/search/v2?build_time=1575096760616&page={}&keyword=python%E7%88%AC%E8%99%AB&type=intern&area=&months=&days=&degree=&official=&enterprise=&salary=-0&publishTime=&sortType=&city=%E5%8C%97%E4%BA%AC&internExtend='.format(i)
            lagou(url, headers, proxies)
        # p = PyMysql()
        ## Create-table statement
        # p.sql_caozuo(
        #     'create table lianjiadata(id int primary key auto_increment,img varchar(255),name varchar(255),price varchar(255),area varchar(255),address varchar(255),shangquan varchar(255),huxinimg varchar(5555),huxingdata varchar(5555),shoulouadd varchar(255),kaifa varchar(255),wuye varchar(255),kaipan varchar(255),jianzhutype varchar(255),jiaofangtime varchar(255),rongji varchar(255),chanquan varchar(255),lvhau varchar(255),usernum varchar(255),wuyefei varchar(255),chewei varchar(255),gn varchar(255),gs varchar(255),gd varchar(255),jzarea varchar(255))')
        ## Insert-data statement
    
    
    

20. Headless Browser

    from selenium import webdriver
    import time
    
    # (1) Hook Selenium up to a browser
    driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    
    # (2) Open the site
    url = 'https://www.baidu.com/'
    driver.get(url=url)
    
    # (3) Take a screenshot
    driver.save_screenshot('baidu.png')
    
    # (4) Find the search box and type into it:
    driver.find_element_by_id('kw').send_keys('吴亦凡')
    
    # (5) Take another screenshot:
    driver.save_screenshot('baidu02.png')
    
    # (6) Simulate a click
    driver.find_element_by_id('su').click()
    time.sleep(3)
    driver.save_screenshot('baidu03.png')
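    # PhantomJS is no longer maintained and newer Selenium releases have dropped support
    # for it. A roughly equivalent sketch with headless Chrome (assuming a matching
    # chromedriver is available on the PATH) could look like this:
    from selenium import webdriver
    import time

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')          # run Chrome without opening a window
    driver = webdriver.Chrome(options=options)
    driver.get('https://www.baidu.com/')
    driver.save_screenshot('baidu.png')
    driver.find_element_by_id('kw').send_keys('python')   # type into the search box
    driver.find_element_by_id('su').click()               # click the search button
    time.sleep(3)
    driver.save_screenshot('baidu_result.png')
    driver.quit()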
