1. Import the requests module
import requests
2. Set the target URL (Baidu homepage)
url = "https://www.baidu.com/"
3. Request method: GET
3.1 Disguise the client by adding headers:
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36','Cookie':'BIDUPSID=F6BBCD59FE2A646812DB8DAE641A0BE5; PSTM=1573713375; BAIDUID=F6BBCD59FE2A6468D0329C1E2F60212F:FG=1; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1452_21098_29568_29221_26350; delPer=0; BD_CK_SAM=1; PSINO=2; H_PS_645EC=50d5uY51q2qJG%2BVlK7rlPmCgY73TcN9qKRz4sPKuBII1GIkIx4QkChitGd4; BDSVRTM=209'}
4. Send the request and get the response
response = requests.get(url=url,headers=headers)
5. Inspect the response content
(1) response.text: returns the body as decoded text
(2) response.content: returns the raw bytes; call .content.decode('utf-8') to decode them yourself
6. Write the page to a local file
with open("baidu.html",'w',encoding="utf-8") as fp:
fp.write(response.content.decode("utf-8"))
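Note on step 5: response.text decodes with whatever encoding requests guessed from the response headers, which can be wrong for some pages, while response.content leaves the decoding to you. A minimal sketch of the difference (variable names are illustrative, not from the original):

# Encoding requests guessed from the response headers
print(response.encoding)
# Decoded with that guess; may be garbled if the guess is wrong
text_guess = response.text
# Raw bytes decoded explicitly, so the charset is under your control
text_exact = response.content.decode('utf-8')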
import requests

kw = input("请输入你要访问的贴吧名称:")
url = 'https://tieba.baidu.com/f?ie=utf-8&kw={}&fr=search&pn='.format(kw)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}
for page in range(10):
    full_url = url + str(page * 50)
    # Send the request
    response = requests.get(url=full_url, headers=headers).content.decode("utf-8")
    # Save the page
    with open("tieba{}.html".format(page + 1), 'w', encoding='utf-8') as fb:
        fb.write(response)

Supplementary content:
# GET request parameters:
params = {
    'ie': 'utf-8',
    'kw': 'python',
    'fr': 'search',
    'red_tag': 'y2156030250'
}
url = 'http://tieba.baidu.com/f?'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}
response = requests.request('get', url=url, params=params, headers=headers).content.decode('utf-8')
print(response)
import requests
import json
def fanyi(kw):
    # 1. URL
    url = 'https://fanyi.baidu.com/sug'
    # Request method: POST
    # 2. Parameters
    data = {'kw': kw}
    # 3. Send the request
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}
    response = requests.post(url=url, data=data, headers=headers).content.decode('utf-8')
    response = json.loads(response)
    for i in response['data']:
        word = i["k"]
        translate = i["v"]
        print(word + ":" + translate + '\n')
        with open('{}.txt'.format(kw), 'a', encoding='utf-8') as fp:
            fp.write(word + ":" + translate + '\n')

if __name__ == '__main__':
    while True:
        kw = input("请输入你要翻译的内容======>")
        fanyi(kw)
import requests
# 1. URL
url = 'http://www.renren.com/PLogin.do'
# Request method: POST
# 2. Parameters
data = {
    'email': '18811176939',
    'password': '123457'
}
# 3. Send the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
response = requests.post(url=url, data=data, headers=headers).content.decode('utf-8')
with open('renren.html', 'w', encoding='utf-8') as fp:
    fp.write(response)
import requests
import random
# Build a proxy pool
proxies = [
    {'http': '124.113.217.5:9999', 'https': ''},
    {'http': '183.164.239.177', 'https': ''}
]
# Pick a proxy IP at random
prox = random.choice(proxies)
print(prox)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}
response = requests.get('https://www.kuaidaili.com/free/inha/',proxies=prox,headers=headers).content.decode('utf-8')
print(response)
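Free proxy IPs like the ones above go stale quickly. A small liveness check, a sketch that is not part of the original and uses the public echo endpoint http://httpbin.org/ip, can filter the pool before crawling:

def proxy_alive(prox, timeout=5):
    # Any exception or non-200 status means the proxy is unusable
    try:
        r = requests.get('http://httpbin.org/ip', proxies=prox, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

alive_proxies = [p for p in proxies if proxy_alive(p)]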
import requests
import json
def weatherlist(url1, url2, headers, proxies):
    response = requests.get(url=url1, headers=headers, proxies=proxies).content.decode('utf-8')
    response = json.loads(response)
    for i in response["data"]["cityByLetter"].values():
        for j in i:
            adcode = j["adcode"]
            name = j["name"]
            full_url = url2 + adcode
            response = requests.get(url=full_url, headers=headers, proxies=proxies).content.decode('utf-8')
            response = json.loads(response)
            print(response)
            try:
                if response["data"]["data"]:
                    for weather in response["data"]["data"]:
                        for day in weather['forecast_data']:
                            weather_name = day['weather_name']
                            temp_min = day['min_temp']
                            temp_max = day['max_temp']
                            with open('weather_list.txt', 'a', encoding='utf-8') as fp:
                                fp.write("城市:" + name + " 天气: " + weather_name + " 最高气温: " + temp_max + " 最低气温: " + temp_min + '\n')
            except:
                print('空')

if __name__ == '__main__':
    url1 = 'https://www.amap.com/service/cityList'
    url2 = 'https://www.amap.com/service/weather?adcode='
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36','Cookie':'BIDUPSID=F6BBCD59FE2A646812DB8DAE641A0BE5; PSTM=1573713375; BAIDUID=F6BBCD59FE2A6468D0329C1E2F60212F:FG=1; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1452_21098_29568_29221_26350; delPer=0; BD_CK_SAM=1; PSINO=2; H_PS_645EC=50d5uY51q2qJG%2BVlK7rlPmCgY73TcN9qKRz4sPKuBII1GIkIx4QkChitGd4; BDSVRTM=209'}
    proxies = {'http': '124.113.217.5:9999', 'https': ''}
    weatherlist(url1, url2, headers, proxies)
import requests
import json
def fanyi(url, headers, proxies, data):
    response = requests.post(url=url, headers=headers, proxies=proxies, data=data).content.decode('utf-8')
    response = json.loads(response)
    print(response)

if __name__ == '__main__':
    url = 'http://fy.iciba.com/ajax.php?a=fy'
    w = input("请输入你要翻译的单词=======>")
    data = {
        'f': 'auto',
        't': 'auto',
        'w': w
    }
    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36','Cookie': 'UM_distinctid=16e9cecbfbb12e-0701242418f796-54133310-100200-16e9cecbfbc135; CNZZDATA1256573702=520451556-1574586989-%7C1574586989; __gads=ID=80bd4d4328d6d249:T=1574590731:S=ALNI_MaVD1f5SOmn3mHzHr4qp3LOGH6REA','a': 'fy'}
    fanyi(url, headers, proxies, data)
# Method 1: log in with a POST, then carry the Cookie header manually ==========
import requests
url = 'http://www.renren.com/PLogin.do'
data = {
    'email': '**********',
    'password': '*******'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
response = requests.post(url=url,headers=headers,data=data).content.decode("utf-8")
url2 = 'http://www.renren.com/964508169/newsfeed/photo'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
'Cookie': 'anonymid=k39t0b6ygzu4ll; depovince=GW; _r01_=1; jebe_key=a4314260-8a4a-4342-8995-c9e648a7a1b8%7C9a7ca8d077b15761409e1298ab30421f%7C1574406709891%7C1%7C1574406774042; JSESSIONID=abcn0y6W3hiuoaYCyCG6w; ick_login=0e033cf7-03c1-4f2d-87c4-3792223ab81b; first_login_flag=1; ln_uact=18811176939; ln_hurl=http://hdn.xnimg.cn/photos/hdn421/20191125/1040/main_v6a1_0aac000038d2195a.jpg; wp_fold=0; jebecookies=a12f9f8a-7b0f-413a-b5b9-17fc13c3e2e5|||||; _de=E1BF45DCCBECABDD1B5D679B401790AD; p=4eca4158940cc418e6f44861141b88f89; t=f0efd7f6c081ed1e0658bfbb470710c59; societyguester=f0efd7f6c081ed1e0658bfbb470710c59; id=964508169; xnsid=5f89fe84; ver=7.0; loginfrom=null; jebe_key=a4314260-8a4a-4342-8995-c9e648a7a1b8%7Cf096e2c5efb5a6aed844642618d7e763%7C1574650138110%7C1%7C1574650203722'}
response2 = requests.get(url=url2,headers=headers).content.decode('utf-8')
with open('ren.html', 'w', encoding='utf-8') as fp:
    fp.write(response2)
# Method 2: use a session object so cookies are handled automatically
# (1) Create the session object
sess = requests.session()
# (2) Simulate the login; the session records the client's identity (cookies)
url1 = 'http://www.renren.com/PLogin.do'
data = {
    'email': '18811176939',
    'password': '123457'
}
sess.post(url=url1, data=data)
# (3) Request the home-page feed with the same session
url2 = 'http://www.renren.com/964508169/newsfeed/photo'
response = sess.get(url=url2).content.decode('utf-8')
with open('renren.html', 'w', encoding='utf-8') as fp:
    fp.write(response)
import requests
import json
def douban(url, headers, proxies):
    content = requests.get(url=url, headers=headers, proxies=proxies).content.decode('utf-8')
    movie_data = json.loads(content)
    data_list = []
    for movie in movie_data:
        data_dict = {}
        data_dict['title'] = movie.get('title')
        data_dict['regions'] = movie.get('regions')
        data_dict['types'] = movie.get('types')
        data_dict['url'] = movie.get('url')
        data_dict['actors'] = movie.get('actors')
        data_list.append(data_dict)
    json_data = json.dumps(data_list, ensure_ascii=False)
    with open('moviedata.json', 'w', encoding='utf-8') as fp:
        fp.write(json_data)

if __name__ == '__main__':
    url = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=0&limit=20'
    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
    douban(url, headers, proxies)
import requests
import json
import re
def maoyan(url,headers,proxies):
response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
pattern = re.compile(r'[\d\D]*? ')
movie_list = pattern.findall(response)
movielist = []
for movie in movie_list:
movie_dict = {}
#名字
pattern = re.compile(r'}">(.*?)')
title = pattern.findall(movie)[0]
movie_dict["title"] = title
print(title)
#排名
pattern = re.compile(r'')
rank = pattern.findall(movie)[0]
movie_dict["rank"] = rank
print(rank)
#评分
pattern = re.compile(r'(\d\.)(\d)
')
score = pattern.findall(movie)
score = score[0][0]+score[0][1]
movie_dict["score"] = score
print(score)
#图片
pattern = re.compile(r'
# Requirements:
# 1. Animal site
#    http://www.iltaw.com/animal/all (Animal World)
#    For each animal: image, Chinese name, English name, scientific name, introduction, diet,
#    reproduction, habits, distribution, physical characteristics, ecological habits,
#    growth and breeding, geographic range.
import requests
import json
import re
# Return the first match if the result list is non-empty, otherwise an empty string
def panduan(obj):
    if obj:
        return obj[0]
    else:
        return ''
def dongwu(url,headers,proxies):
content = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
pattern = re.compile(r'[\d\D]*? ')
animal_list = pattern.findall(content)
animallist = []
for animal in animal_list:
animal_dict = {}
#详情页链接
pattern = re.compile('')
detail = pattern.findall(animal)[0]
animal_dict["detail"] = detail
animallist.append(animal_dict)
#循环列表,取链接
animaldetails = []
for animal_href in animallist:
animaldict = {}
href = animal_href.get("detail")
response = requests.get(url=href,headers=headers,proxies=proxies).content.decode('utf-8')
#图片
pattern = re.compile('')
name = pattern.findall(response)
animaldict["中文名"] = panduan(name)
#英文名
pattern = re.compile('
英文名:(.*?);
')
engname = pattern.findall(response)
animaldict["英文名"] = panduan(engname)
#学名
pattern = re.compile('
学名:(.*?)')
xuename = pattern.findall(response)
animaldict["学名"] = panduan(xuename)
#简介
pattern = re.compile('。
(.*?)
import requests
import json
import re
def guba(url,headers,proxies):
response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
pattern = re.compile(r'[\w\W]*?
')
list = pattern.findall(response)[0]
li_pattern = re.compile(r'([\d\D]*?) ')
li_list = li_pattern.findall(list)
gubalist = []
for li in li_list:
dict = {}
pattern = re.compile(r'(.*?)',re.S)
data = pattern.findall(li)
print(data[0].strip())
print(data[1].strip())
dict["阅读量"] = data[0].strip()
dict["评论数"] = data[1].strip()
#名称
pattern = re.compile(r'.* class="balink">(.*?)]')
baname = pattern.findall(li)
print(baname[0])
dict["吧名称"] = baname[0]
#标题
pattern = re.compile(r'(.*?)')
author = pattern.findall(li)
print(author[0])
dict["作者"] = author[0]
#更新时间
pattern = re.compile(r'(.*?)')
time = pattern.findall(li)
print(time[0])
dict["更新时间"] = time[0]
#详情url
pattern = re.compile(r'
import requests
import re
import json
def yaofang(url,headers,proxies):
response = requests.get(url=url,headers=headers,proxies=proxies).content.decode( 'gbk')
pattern = re.compile(r'(.*?)',re.S)
name = pattern.findall(response)
if name:
dict["店名"] = name[0]
else:
dict["店名"] = "自营"
# detail_list.append(dict)
#详情
pattern = re.compile(r'[\w\W]*?')
table_list = pattern.findall(response)
for detail in table_list:
dict2 = {}
# print(detail)
pattern = re.compile(r'商品名称: .*?(.*?) ',re.S)
name = pattern.findall(detail)[0]
dict["商品名称"] = name
#品牌
pattern = re.compile(r'品 牌:.*?(.*?) ',re.S)
pinpai = pattern.findall(detail)[0]
dict["品牌"] = pinpai
#规格
pattern = re.compile(r'规 格:.*?(.*?) ', re.S)
guige = pattern.findall(detail)[0]
dict["规格"] = guige
# 规格
pattern = re.compile(r'重 量:.*?(.*?) ', re.S)
zhongliang = pattern.findall(detail)[0]
dict["重量"] = zhongliang
# 生产厂商
pattern = re.compile(r'生产厂商:.*?(.*?) ', re.S)
shengchan = pattern.findall(detail)[0]
dict["生产厂商"] = shengchan
# # 批准文号
# pattern = re.compile(r'批准文号:.*?(.*?) ', re.S)
# pizhun = pattern.findall(detail)[0].split()
# # dict2["批准文号"] = pizhun
# # print(pizhun)
detail_list.append(dict)
data = json.dumps(detail_list,ensure_ascii=False)
with open('yaofan.json','w',encoding='utf-8') as fp:
fp.write(data)
if __name__ == '__main__':
for i in range(1,6):
url = 'https://www.111.com.cn/categories/953710?tp=10-{}'.format(i)
proxies = {'http': '117.95.192.4:9999', 'https': ''}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
'Cookie': '__cfduid=d45abc9c47821980f926a234bc1d141771574671242; PHPSESSID=1111adcbe2de581fc10d8912a9100db8; Hm_lvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; Hm_lpvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; UM_distinctid=16ea1b94f0f142-0edd2ae99214fd-54133310-100200-16ea1b94f108d2; CNZZDATA1000267376=387552791-1574669922-%7C1574669922'}
yaofang(url,headers,proxies)
from lxml import etree
import requests
import json
def maoyan(url, headers, proxies):
    response = requests.get(url=url, headers=headers, proxies=proxies).content.decode('utf8')
    tree = etree.HTML(response)
    dd_list = tree.xpath('//dd')
    list = []
    for dd in dd_list:
        dict = {}
        # Rank
        paiming = dd.xpath('./i/text()')[0]
        dict["排名"] = paiming
        # Title
        name = dd.xpath('./a/@title')[0]
        dict["名字"] = name
        # Cast
        star = dd.xpath('.//p[@class="star"]/text()')[0].strip().replace('主演:', '')
        dict["主演"] = star
        # Release date
        time = dd.xpath('.//p[@class="releasetime"]/text()')[0].replace('上映时间:', '')
        dict["上映时间"] = time
        # Poster image link
        img = dd.xpath('.//img[@class="board-img"]/@data-src')[0]
        dict['图片链接'] = img
        # Detail-page link
        detail = dd.xpath('./a/@href')[0]
        dict["详情链接"] = detail
        list.append(dict)
    data = json.dumps(list, ensure_ascii=False)
    with open('maoyan.json', 'w', encoding='utf-8') as fp:
        fp.write(data)

if __name__ == '__main__':
    url = 'https://maoyan.com/board'
    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
    maoyan(url, headers, proxies)
"""
Lianjia crawler
1. Case requirements:
Lianjia: https://bj.fang.lianjia.com/loupan/
1. Get the pinyin abbreviation of every city.
2. Build each city's URL from the pinyin and fetch all of its listings.
3. List page: image, development name, average price, floor area, district, business area.
   Detail page: floor plans (e.g. ["8室5厅8卫", "4室2厅3卫", "5室2厅2卫"]), orientation, images (list), user reviews (optional).
Project address: 通燕高速耿庄桥北出口中化石油对面
Sales office address: 通燕高速北侧耿庄桥出口北500米潞苑南大街(接待时间 9:00 - 18:00)
Developer: 石榴置业集团股份有限公司
Property management company: 浙江省绿城物业服务有限公司
Latest opening: 2018.01.20, convert to a timestamp
Property type: 别墅 (villa)
Handover date: 2016年05月10日, convert to 2016-05-10
Plot ratio: 1.20
Property rights term: 70 years
Greening rate: 30%
Planned households: 173
Property fee: 6.91~7.13元/m2/月 (drop the trailing 元/m2/月)
Parking: 地下车位数584 (keep only the number)
Heating: 集中供暖 (central heating)
Water supply: 民水 (residential)
Electricity supply: 民电 (residential)
Building type: 板楼 (slab building)
Undesirable facilities: none
Land area: 39,600㎡ (drop the unit)
Floor area: 40,000㎡ (drop the unit)
"""
import pymysql
class PyMysql:
    def __init__(self):
        # 1. Connect to the database
        self.db = pymysql.connect(user='root', host='localhost', password='123456', database='lianjia')
        # 2. Create a cursor
        self.c = self.db.cursor()

    def sql_caozuo(self, sql):
        # 3. Execute the SQL statement
        self.c.execute(sql)
        # Print whatever the statement returned
        print(self.c.fetchall())

    # __del__ runs when the object is garbage-collected after the main program finishes;
    # it commits the transaction and closes the cursor and connection.
    def __del__(self):
        self.db.commit()
        self.c.close()
        self.db.close()
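The INSERT built later with str.format works, but interpolating scraped strings straight into SQL breaks as soon as the data contains quotes. A hedged alternative sketch (the example values are placeholders, not from the original) using pymysql's parameter binding through the same class:

p = PyMysql()
# Placeholders are always %s with pymysql; the driver escapes the values itself
p.c.execute('insert into lianjia_data(img, name) values (%s, %s)', ('http://example.com/a.jpg', 'demo'))
p.db.commit()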
import requests
from lxml import etree
import json
import re
import math
def panduan(obj):
    if obj:
        return obj[0]
    else:
        return 'None'
def pinyin(url2,headers,proxies):
response = requests.get(url=url2,headers=headers,proxies=proxies).text
response = str(response)
partten = re.compile(r'"short":"(.*?)"')
pinyin_list= partten.findall(response)
for pinyin in pinyin_list:
full_url = 'https://{}.fang.lianjia.com/loupan/'.format(pinyin)
try:
response1 = requests.get(url=full_url, headers=headers, proxies=proxies).content.decode('utf-8')
tree = etree.HTML(response1)
except Exception:
    continue
# 提取页数:
total_page = tree.xpath('//div[@class="page-box"]/@data-total-count')
for page in range(1, math.ceil(int(total_page[0]) / 10) + 1):
url = full_url+'pg{}'.format(page)
# print(url)
response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
tree = etree.HTML(response)
li_list = tree.xpath('//ul[@class="resblock-list-wrapper"]/li')
lianjia_list=[]
for li in li_list:
lianjia_dict = {}
# (1)图片:
src = li.xpath('.//img[@class="lj-lazy"]/@data-original')
print(panduan(src))
tupian = panduan(src)
lianjia_dict["图片链接"] = tupian
# (2) 楼盘名称,
title = li.xpath('./a/@title')
print(panduan(title))
loupan = panduan(title)
lianjia_dict["楼盘名称"] = loupan
# (3) # 均价,
price = li.xpath('.//span[@class="number"]/text()|.//span[@class="desc"]/text()')
price_new = ''.join(price).replace('\xa0', '')
print(price_new)
lianjia_dict["均价"] = price_new
# (4)建筑面积,
area_list = li.xpath('.//div[@class="resblock-area"]/span/text()')
print(panduan(area_list))
jianzhu = panduan(area_list)
lianjia_dict["建筑面积"] = jianzhu
#(5) 区域
add = li.xpath('.//div[@class="resblock-location"]/span/text()')
quyu = panduan(add)
print(panduan(add))
lianjia_dict["区域"] = quyu
#(6)商圈
shop = li.xpath('.//div[@class="resblock-location"]/a/text()')
print(panduan(shop))
shangquan = panduan(shop)
lianjia_dict["商圈"] = shangquan
#详情页url:
href = li.xpath('.//a[@class="resblock-room"]/@href')[0].replace("/loupan/","")
detail_url = full_url+href
# print(detail_url)
response = requests.get(url=detail_url,headers=headers,proxies=proxies).content.decode('utf-8')
detail_tree = etree.HTML(response)
# 访问详情页
# (5)户型
li_list = detail_tree.xpath('//li[@data-index="0"]/ul')
huxinglist = []
chaoxianglist = []
imglist = []
for li in li_list:
#户型
huxing = li.xpath('.//div[@class="content-title"]/text()')
huxinglist.append(panduan(huxing).strip())
#朝向
chaoxiang = li.xpath('.//div[@class="content-area"]/text()')
chaoxianglist.append(panduan(chaoxiang))
#img
img = li.xpath('.//img/@src')[0]
imglist.append(img)
print(imglist)
lianjia_dict["户型图片"] = imglist
#将户型朝向以键值对的形式存入字典中
huchao_list = []
for huxing,chaoxiang in zip(huxinglist,chaoxianglist):
dict = {}
dict[huxing] = chaoxiang
huchao_list.append(dict)
print(huchao_list)
lianjia_dict["户型朝向"] = huchao_list
#楼盘详情信息
xhref = detail_tree.xpath('.//div[@class="more-building"]/a/@href')[0]
href = full_url.replace('/loupan/','')+xhref
# print(href)
response_xinxi = requests.get(url=href, headers=headers,proxies=proxies).content.decode('utf-8')
xinxi_tree = etree.HTML(response_xinxi)
xinxi_list = xinxi_tree.xpath('//div[@class="big-left fl"]')
for xinxi in xinxi_list:
# 售楼处地址
shoulou = xinxi.xpath('.//ul[@class="x-box"][1]/li[@class="all-row"][2]/span[@class="label-val"]/text()')
sl = panduan(shoulou)
print(sl)
lianjia_dict["售楼处地址"] = sl
# 开发商
kaifa = xinxi.xpath('.//ul[@class="x-box"][1]/li[@class="all-row"][3]/span[@class="label-val"]/text()')
kf = panduan(kaifa)
print(kf)
lianjia_dict["开发商"] = kf
# 物业公司
wuye = xinxi.xpath('.//ul[@class="x-box"][3]//span[@class="label-val"]/text()')
wy = panduan(wuye)
print(wy)
lianjia_dict["物业公司"] = wy
# 最新开盘
kaipan = xinxi.xpath('./ul[@class="fenqi-ul"]/li[3]/span[@class="fq-td fq-open"]/span/text()')
kp = panduan(kaipan)
print(kp)
lianjia_dict["最新开盘"] = kp
# 物业类型
wuyetype = xinxi.xpath('.//ul[@class="x-box"][2]//span[@class="label-val"]/text()')
wuyet = panduan(wuyetype)
print(wuyet)
lianjia_dict["物业类型"] = wuyet
# 交房时间
jiaofangtime = xinxi.xpath('./ul[@class="fenqi-ul"]/li[@class="fq-nbd"]/span[@class="fq-td fq-open"]/span/text()')
jf = panduan(jiaofangtime)
print(jf)
lianjia_dict["交房时间"] = jf
# # 容积率
rongji = xinxi.xpath('./ul[@class="x-box"][2]/li[4]/span[@class="label-val"]/text()')
rj = panduan(rongji).strip()
print(rj)
lianjia_dict["容积率"] = rj
# 产权年限
chanquan = xinxi.xpath('.//ul[@class="x-box"][2]/li[8]/span[@class="label-val"]/text()')
cq = panduan(chanquan).strip()
print(cq)
lianjia_dict["产权年限"] = cq
# 绿化率
lvhua = xinxi.xpath('.//ul[@class="x-box"][2]/li[2]/span[@class="label-val"]/text()')
lh = panduan(lvhua).strip()
print(lh)
lianjia_dict["绿化率"] = lh
# 规划户数
yonghu = xinxi.xpath('.//ul[@class="x-box"][2]/li[7]/span[@class="label-val"]/text()')
yh = panduan(yonghu)
print(yh)
lianjia_dict["规划户数"] = yh
# 物业费用
wuyefei = xinxi.xpath('.//ul[@class="x-box"][3]/li[3]/span[@class="label-val"]/text()')
wyf = panduan(wuyefei)
print(wyf)
lianjia_dict["物业费用"] = wyf
# 车位情况
chewei = xinxi.xpath('.//ul[@class="x-box"][3]/li[7]/span[@class="label-val"]/text()')
cw = panduan(chewei).strip()
print(cw)
lianjia_dict["车位情况"] = cw
# 供暖方式
gongnuan = xinxi.xpath('.//ul[@class="x-box"][3]/li[4]/span[@class="label-val"]/text()')
gn = panduan(gongnuan)
print(gn)
lianjia_dict["供暖方式"] = gn
# 供水方式
gongshui = xinxi.xpath('.//ul[@class="x-box"][3]/li[5]/span[@class="label-val"]/text()')
gs = panduan(gongshui)
print(gs)
lianjia_dict["供水方式"] = gs
# 供电方式
gongdian = xinxi.xpath('.//ul[@class="x-box"][3]/li[6]/span[@class="label-val"]/text()')
gd = panduan(gongdian)
print(gd)
lianjia_dict["供电方式"] = gd
# # 嫌恶设施
# 占地面积
mianji = xinxi.xpath('.//ul[@class="x-box"][2]/li[3]/span[@class="label-val"]/text()')
mj = panduan(mianji).strip()
print(mj)
lianjia_dict["占地面积"] = mj
#存入
p = PyMysql()
sql = 'insert into lianjia_data(img,name,price,area,address,shangquan,huxinimg,huxingdata,shoulouadd,kaifa,wuye,kaipan,jianzhutype,jiaofangtime,rongji,chanquan,lvhau,usernum,wuyefei,chewei,gn,gs,gd,jzarea) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(tupian,loupan,price_new,jianzhu,quyu,shangquan,imglist,huchao_list,sl,kf,wy,kp,wuyet,jf,rj,cq,lh,yh,wyf,cw,gn,gs,gd,mj)
p.sql_caozuo(sql)
# print(lianjia_dict)
lianjia_list.append(lianjia_dict)
# print(lianjia_list)
# data =json.dumps(lianjia_list,ensure_ascii=False)
# with open('lianjia.json','a',encoding='utf-8') as fp:
# fp.write(data)
if __name__ == '__main__':
proxies = {'http': '125.110.90.93:9000', 'https': ''}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
url2 = 'https://ajax.api.lianjia.com/config/cityConfig/getConfig?callback=jQuery1111039566567830421073_1574853147759&type=province&category=1&_=1574853147760'
p = PyMysql()
## DROP statement
# p.sql_caozuo('drop tables demo03')
## CREATE TABLE statement
p.sql_caozuo(
'create table lianjia_data(id int primary key auto_increment,img varchar(255),name varchar(255),price varchar(255),area varchar(255),address varchar(255),shangquan varchar(255),huxinimg varchar(5555),huxingdata varchar(5555),shoulouadd varchar(255),kaifa varchar(255),wuye varchar(255),kaipan varchar(255),jianzhutype varchar(255),jiaofangtime varchar(255),rongji varchar(255),chanquan varchar(255),lvhau varchar(255),usernum varchar(255),wuyefei varchar(255),chewei varchar(255),gn varchar(255),gs varchar(255),gd varchar(255),jzarea varchar(255))')
## The INSERT statements run inside pinyin()
pinyin(url2,headers,proxies)
import requests
import re
import json
from lxml import etree
def panduan(obj):
if obj:
return obj
else:
return ''
def taobao(url,headers,proxies):
response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf8')
pattern = re.compile(r'\((.*)\)',re.S)
data = pattern.findall(response)[0].strip()
data = json.loads(data)
data = data["result"]
for k,v in data.items():
dict = {}
result = v["result"]
for i in result:
dict["价格"] = i["item_current_price"]
# print(dict)
# print(result)
if __name__ == '__main__':
datalist = [
'https://tce.taobao.com/api/mget.htm?callback=jsonp1579&tce_sid=1870316,1871653&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
'https://tce.taobao.com/api/mget.htm?callback=jsonp1666&tce_sid=1870321,1871654&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
'https://tce.taobao.com/api/mget.htm?callback=jsonp1753&tce_sid=1870333,1871655&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
'https://tce.taobao.com/api/mget.htm?callback=jsonp1840&tce_sid=1870340,1871656&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
'https://tce.taobao.com/api/mget.htm?callback=jsonp1927&tce_sid=1870341,1871659&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
'https://tce.taobao.com/api/mget.htm?callback=jsonp2014&tce_sid=1870342,1871657&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
'https://tce.taobao.com/api/mget.htm?callback=jsonp2101&tce_sid=1870343,1871658&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online']
proxies = {'http': '125.110.90.93:9000', 'https': ''}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
for url in datalist:
taobao(url,headers,proxies)
# https://music.163.com/#/discover/artist/
import requests
import json
import re
from lxml import etree
def wangyi(url,headers,proxies):
response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf8')
# with open('wangyi.html','w',encoding='utf-8') as fp:
# fp.write(response)
# 获取男女歌手链接
tree = etree.HTML(response)
sing_list = tree.xpath('//div[@id="singer-cat-nav"]//a/text()')
# print(sing_list)
#获取url
sing_url_list = tree.xpath('//div[@id="singer-cat-nav"]//a/@href')
# print(sing_url_list)
for singer,singer_url in zip(sing_list,sing_url_list):
sing_full_url = 'https://music.163.com'+singer_url
response = requests.get(url=sing_full_url,headers=headers,proxies=proxies).content.decode('utf8')
tree = etree.HTML(response)
#提取字母链接
letter_href_list = tree.xpath('//ul[@id="initial-selector"]/li[position()>1]/a/@href')
for letter_url in letter_href_list:
letter_ful_url = 'https://music.163.com'+letter_url
response = requests.get(url=letter_ful_url,headers=headers,proxies=proxies).content.decode('utf8')
tree = etree.HTML(response)
#提取歌手名字
name_list = tree.xpath(
'//ul[@class="m-cvrlst m-cvrlst-5 f-cb"]/li//a[@class="nm nm-icn f-thide s-fc0"]/text()')
href_list = tree.xpath(
'//ul[@class="m-cvrlst m-cvrlst-5 f-cb"]/li//a[@class="nm nm-icn f-thide s-fc0"]/@href')
for name, j in zip(name_list, href_list):
full_href = 'https://music.163.com' + j.strip()
print(name,full_href)
if __name__ == '__main__':
proxies = {'http': '125.110.90.93:9000', 'https': ''}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
url = 'https://music.163.com/discover/artist/cat'
wangyi(url,headers,proxies)
import requests
import re
import json
from lxml import etree
import time
from time import strftime
import pymysql
import random
class PyMysql:
def __init__(self):
##1.链接数据库
self.db = pymysql.connect(user='root',host='localhost',password='123456',database='lianjia')
##2.创建游标
self.c = self.db.cursor()
def sql_caozuo(self,sql):
##3.执行sql语句
self.c.execute(sql)
##查看回执
print(self.c.fetchall())
##__del__魔术方法在主程序运行结束后,开始运行垃圾回收机制时候运行.
def __del__(self):
self.db.commit()
self.c.close()
self.db.close()
def zhilian(url,headers,proxies):
response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf8')
data = json.loads(response)
# print(data["data"]["results"])
data_list = []
for job in data["data"]["results"]:
job_dict = {}
# print(job)
#岗位名称
jobname = job["jobName"]
print(jobname)
job_dict["岗位名称"] = jobname
#公司名称
companyname = job["company"]["name"]
print(companyname)
job_dict["公司名称"] = companyname
#公司人数
companynum = job["company"]["size"]["name"]
print(companynum)
job_dict["公司人数"] = companynum
#公司类型
companytype = job["company"]["type"]["name"]
print(companytype)
job_dict["公司类型"] = companytype
#技能要求
positionLabel = job["positionLabel"]
data = json.loads(positionLabel)
skillLabel = data["skillLabel"]
label_list = []
try:
if skillLabel[0]:
for label in skillLabel:
label = label["value"]
label_list.append(label)
except:
label_list.append("None")
label = label_list
job_dict["技能要求"] = label_list
#详情url
detail_url = job["positionURL"]
print(detail_url)
response = requests.get(url=detail_url,headers=headers,proxies=proxies).content.decode("utf8")
# time.sleep(5)
# print(response)
tree = etree.HTML(response)
# 职业描述
zhize_list = tree.xpath('.//div[@class="describtion"]/div[@class="describtion__detail-content"]/text()')
print(zhize_list)
job_dict["职业描述"] = zhize_list
# 工作地点
display = job["city"]["display"]
businessArea = job["businessArea"]
bs = businessArea
ds = display
area = ds + bs
print(area)
job_dict["工作地点"] = area
# 薪资水平
salary = job["salary"]
print(salary)
job_dict["薪资水平"] = salary
# 任职资格
# 学历
eduLevel = job["eduLevel"]["name"]
# 工作经验
workingExp = job["workingExp"]["name"]
zige = eduLevel + workingExp
print(zige)
job_dict["任职资格"] = zige
# 更新发布时间
updateDate = job["updateDate"]
fabutime = updateDate
print(fabutime)
job_dict["更新发布时间"] = fabutime
#爬取人
crawl_name = "高祎曼"
print(crawl_name)
job_dict["爬取人"] = crawl_name
#爬取时间
paqutime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())
print(paqutime)
job_dict["爬取时间"] = paqutime
data_list.append(job_dict)
paquwangzhan = "智联"
# p = PyMysql()
# sql = 'insert into zhilian_data(岗位名称,公司名称,公司人数,公司类型,技能要求,职业描述,工作地点,薪资水平,任职资格,更新发布时间,爬取人,爬取时间,爬取网站) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(jobname,companyname,companynum,companytype,label,zhize_list,area,salary,zige,fabutime,crawl_name,paqutime,paquwangzhan)
# p.sql_caozuo(sql)
# data = json.dumps(data_list,ensure_ascii=False)
# with open('zhilian.json','w',encoding='utf-8') as fp:
# fp.write(data)
if __name__ == '__main__':
proxies = {'http': '125.110.90.93:9000', 'https': ''}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
'cookie':'guid=7575-be2f-7c15-c306; amap_ver=1536672475634; key=6a7665aa7301eae686d9e79884d0445b'}
url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=all&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=python%E7%88%AC%E8%99%AB%E5%B7%A5%E7%A8%8B%E5%B8%88&kt=3&=0&_v=0.15635057&x-zp-page-request-id=fd5128b31ef84aceb1e84d41097963e1-1575100818571-464999&x-zp-client-id=04d803dc-74e4-4170-cf84-d3cdc5607b84&MmEwMD=4T6BXuvx_UOBZhGW5PAnis7UzYfOFZcUeeZ1Hq9jrdCULnycfzc9C7nw5eIOH2F4HihMYmky5F4y4wzH6vHJoJaLwKt2PX9n9eLVZASrAFRl3WXEqNzECxIt2if_0tHRqzNsYsSgp68KeoERtc2Pj8bQuiUJzPKbJJf5kIr_sReC3iAmhs5AkB.q55sS343U6.eC7.kfI2sb_.vTfBfDzJ6zSB_lgyMFXwMfmYMZMTDoKyG0JCeuMjUTVC1k0gGNhVbeLOCCzRjcSTvAKFfbPxnO.W5mSiOgXGCdZiT5Bxlz1piZ.6kc3N4J6QqObZblvoscyt.v0zGieC_8PYEqsj2ztAZNBcbreqmYxmNhVIQzljDX0LhKnaRdfJ5AowwtwYMUf9eWYMcdlkJkv9ton7pbV'
# p = PyMysql()
##建表语句
# p.sql_caozuo(
# 'create table lianjiadata(id int primary key auto_increment,)')
zhilian(url,headers,proxies)
import pymysql
import random
class PyMysql:
def __init__(self):
##1.链接数据库
self.db = pymysql.connect(user='root',host='localhost',password='123456',database='lianjia')
##2.创建游标
self.c = self.db.cursor()
def sql_caozuo(self,sql):
##3.执行sql语句
self.c.execute(sql)
##查看回执
print(self.c.fetchall())
##__del__魔术方法在主程序运行结束后,开始运行垃圾回收机制时候运行.
def __del__(self):
self.db.commit()
self.c.close()
self.db.close()
import requests
from lxml import etree
import json
import re
import math
import time
def panduan(obj):
if obj:
return obj[0]
else:
return 'None'
def lagou(url,headers,proxies):
time.sleep(3)
response = requests.post(url=url,headers=headers,proxies=proxies).text
# data_list = json.loads(response)
print(response)
if __name__ == '__main__':
proxies = {'http': '125.110.90.93:9000', 'https': ''}
usag_list = [
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
]
usag = random.choice(usag_list)
headers = {
'User-Agent': usag,
'cookie':'user_trace_token=20191129155328-84048293-4801-4663-91ae-9695ac9c4693; _ga=GA1.2.1490834596.1575014072; LGUID=20191129155329-54ea0923-127d-11ea-a68e-5254005c3644; _gid=GA1.2.898573975.1575014080; index_location_city=%E5%8C%97%E4%BA%AC; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216eb627d70c176-0beb948187e8ff-32365f08-1049088-16eb627d70d272%22%2C%22%24device_id%22%3A%2216eb627d70c176-0beb948187e8ff-32365f08-1049088-16eb627d70d272%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; JSESSIONID=ABAAABAAADEAAFIA1EE71982D903A109AFEFF779585C4FA; WEBTJ-ID=20191130095146-16eba01bb9d47a-020b149bcc6827-32365f08-1049088-16eba01bb9e36d; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1575014080,1575030607,1575033330,1575078706; LGSID=20191130095042-d17fd70b-1313-11ea-a68e-5254005c3644; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DwzB7mu7hBPcdl6Tni88-qBT6Hm86I74H5shBvm7Ugzi%26wd%3D%26eqid%3D962e999d0007d16a000000025de1caeb; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=search_code; SEARCH_ID=1a48505a36e843569e84500c2600b603; _gat=1; X_HTTP_TOKEN=440fb692ec96ea8037997057518ccbba7f70585ce7; LGRID=20191130101253-ead576cf-1316-11ea-a68e-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1575080037'
}
for i in range(1,8):
url = 'https://www.shixiseng.com/app/interns/search/v2?build_time=1575096760616&page={}&keyword=python%E7%88%AC%E8%99%AB&type=intern&area=&months=&days=°ree=&official=&enterprise=&salary=-0&publishTime=&sortType=&city=%E5%8C%97%E4%BA%AC&internExtend='.format(i)
lagou(url, headers, proxies)
# p = PyMysql()
##建表语句
# p.sql_caozuo(
# 'create table lianjiadata(id int primary key auto_increment,img varchar(255),name varchar(255),price varchar(255),area varchar(255),address varchar(255),shangquan varchar(255),huxinimg varchar(5555),huxingdata varchar(5555),shoulouadd varchar(255),kaifa varchar(255),wuye varchar(255),kaipan varchar(255),jianzhutype varchar(255),jiaofangtime varchar(255),rongji varchar(255),chanquan varchar(255),lvhau varchar(255),usernum varchar(255),wuyefei varchar(255),chewei varchar(255),gn varchar(255),gs varchar(255),gd varchar(255),jzarea varchar(255))')
##插入数据语句
from selenium import webdriver
import time
# (1) Hook selenium up to the browser
driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs-2.1.1-windows\bin\phantomjs.exe')
# (2) Open the site
url = 'https://www.baidu.com/'
driver.get(url=url)
# (3) Take a screenshot
driver.save_screenshot('baidu.png')
# (4) Find the search box and type into it
driver.find_element_by_id('kw').send_keys('吴亦凡')
# (5) Screenshot again
driver.save_screenshot('baidu02.png')
# (6) Click the search button
driver.find_element_by_id('su').click()
time.sleep(3)
driver.save_screenshot('baidu03.png')
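Newer Selenium releases have dropped PhantomJS support, so the same flow may need headless Chrome instead. A sketch under that assumption (it requires a matching chromedriver on PATH; everything else mirrors the steps above):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')          # run Chrome without a visible window
driver = webdriver.Chrome(options=options)  # assumes chromedriver is on PATH
driver.get('https://www.baidu.com/')
driver.save_screenshot('baidu_headless.png')
driver.find_element_by_id('kw').send_keys('python')
driver.find_element_by_id('su').click()
driver.quit()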