import requests
from pyquery import PyQuery as pq
# Scrape the Zhihu "explore" page and append the question, author and answer
# text of every feed item to explore.txt.
url = 'https://www.zhihu.com/explore'
headers = {
    # The two literals are joined by implicit concatenation, so the first one
    # must end with a space to keep the product tokens separated.
    'User-Agent': 'Mozilla/(Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML,like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36'
}
# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops an HTTP error page from being parsed as content.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html = response.text
doc = pq(html)
items = doc('.explore-tab .feed-item').items()
# Open the output file once for the whole run; the context manager closes it
# even if parsing one of the items raises.
with open('explore.txt', 'a', encoding='utf-8') as file:
    for item in items:
        question = item.find('h2').text()
        author = item.find('.author-link-line').text()
        # Re-parse the inner HTML so that .text() strips the nested markup.
        # Guard against feed items that have no .content node.
        content_html = item.find('.content').html()
        answer = pq(content_html).text() if content_html else ''
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '=' * 50 + '\n')
对象:{}包裹起来的内容,数据结构为{key1: value1, key2: value2}的键值对结构。
数组:[]包裹起来的内容,数据结构为["java", "javascript"]的索引结构。
import json
# Parse a JSON array held in a string and read values out of the result.
# The text is bound to `json_text` rather than `str` so the builtin `str`
# type stays usable in the rest of the file.
json_text = '''
[{
"name": "Bob",
"gender": "male",
"birthday": "1992-10-18"
}, {
"name": "Selina",
"gender": "female",
"birthday": "1995-10-18"
}]
'''
print(type(json_text))
data = json.loads(json_text, strict=False)
print(data)
print(type(data))
# Read the "name" field of the first element.
print(data[0]['name'])
# .get() returns None instead of raising KeyError when the key is missing.
print(data[0].get('name'))
# The second argument is the value returned when the key is missing.
print(data[0].get('age', 25))
这里使用loads()方法将JSON字符串转为Python对象(此处为列表)。
特别注意JSON格式:JSON中的字符串必须用双引号包裹,使用单引号会导致loads()解析失败。
# Load test.json into `data`. The `with` statement closes the file
# automatically. The file is read as UTF-8 to match the encoding used when
# this file's other snippet writes it.
with open('test.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
print(data)
# Serialise a list of dicts to test.json as indented, human-readable JSON.
data = [
    {
        'name': 'Bob',
        'gender': 'male',
        'birthday': '1992-10-18',
    },
    {
        'name': '王伟',
        'gender': '男',
        'birthday': '1992-10-18',
    },
]
with open('test.json', 'w', encoding='utf-8') as file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX
    # escapes; indent=2 pretty-prints with two spaces per nesting level.
    json.dump(data, file, indent=2, ensure_ascii=False)
indent:代表缩进字符个数
encoding:编码方式
ensure_ascii:json.dumps序列化时默认会把中文等非ASCII字符转义为\uXXXX形式,需要指定ensure_ascii=False才能输出真正的中文。
CSV文件以纯文本形式存储表格数据,该文件是一个字符序列,可以由任意数目的记录组成,记录间以某种换行符分隔。
import csv
# Write data.csv three ways: single rows, several rows at once, and dict rows.
# newline='' lets the csv module control line endings itself; without it,
# extra blank lines appear between rows on Windows.
with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # Use a space instead of the default comma as the column delimiter.
    writer = csv.writer(csvfile, delimiter=' ')
    writer.writerow(['id', 'name', 'age'])
    writer.writerow(['10001', 'Mike', '20'])
    # Several rows at once: writerows() takes an iterable of rows, whereas
    # writerow() would write the two inner lists as two cells of one row.
    writer.writerows([['10002', 'Mike', 20], ['10003', 'Bob', 22]])
    # Dict-based writing: each row is a mapping keyed by the field names.
    fieldnames = ['id', 'name', 'age']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerow({'id': '10004', 'name': 'Jordan', 'age': 21})
先定义3个字段,用fieldnames表示,然后将其传给DictWriter来初始化一个字典写入对象,接着可以调用writeheader()方法先写入头信息,再调用writerow()方法传入相应的字典即可。
import csv
# Print every row of data.csv as a list of strings.
# newline='' is the csv module's recommended mode, so newlines embedded in
# quoted fields are handed to the reader untouched.
with open('data.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        print(row)
import pandas as pd
# Load data.csv into a DataFrame, silently skipping malformed rows.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is its replacement.
df = pd.read_csv('data.csv', on_bad_lines='skip')
print(df)
import pymysql
# Connect to the local MySQL server, print its version and create the
# `spiders` database.
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306)
try:
    cursor = db.cursor()
    cursor.execute('SELECT VERSION()')
    # fetchone() returns the first row of the result set.
    data = cursor.fetchone()
    print('Database version:', data)
    cursor.execute("CREATE DATABASE spiders DEFAULT CHARACTER SET utf8")
finally:
    # Release the connection even when one of the statements above raises.
    db.close()
connect()声明一个MySQL连接对象,传入MySQL运行的IP,即host。
fetchone()获得第一条数据。
import pymysql
# Insert one student row inside a transaction.
# `student_id` is used instead of `id` so the builtin id() is not shadowed.
student_id = '20120001'
user = 'Bob'
age = 20
# Select the `spiders` database explicitly: without a default database the
# unqualified table name in the INSERT cannot be resolved.
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306,
                     database='spiders')
cursor = db.cursor()
# Placeholders must be lowercase %s (an uppercase %S is not a valid format
# character). The table name matches the `students` table used by the other
# snippets in this file.
sql = 'INSERT INTO students(id,name,age) values(%s,%s,%s)'
try:
    cursor.execute(sql, (student_id, user, age))
    db.commit()
except pymysql.MySQLError as exc:
    # Undo the partial write and report the cause instead of hiding it.
    print('Insert failed:', exc)
    db.rollback()
finally:
    db.close()
插入、更新和删除操作都是对数据库进行更改的操作,而更改操作都必须作为一个事务来执行,所以这些操作的标准写法就是:
# Standard pattern for any write (INSERT / UPDATE / DELETE): commit on
# success, roll back on a database error.
# NOTE(review): this is a template; it assumes `db`, `cursor` and `sql` refer
# to an open connection, its cursor and a complete statement - confirm before
# running it as-is.
try:
    cursor.execute(sql)
    db.commit()
except pymysql.MySQLError:
    db.rollback()
import pymysql
# Build an INSERT statement from a dict so that adding a field only means
# adding a key to `data`.
data = {
    'id': '20120001',
    'name': 'Bob',
    'age': 20
}
table = 'students'
keys = ', '.join(data)
# One %s placeholder per field; list multiplication repeats the placeholder.
values = ', '.join(['%s'] * len(data))
sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
# Select the `spiders` database so the unqualified table name resolves.
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306,
                     database='spiders')
cursor = db.cursor()
try:
    # execute() returns the number of affected rows; commit only when a row
    # was actually written.
    if cursor.execute(sql, tuple(data.values())):
        print('Successful')
        db.commit()
except pymysql.MySQLError:
    print('Failed')
    db.rollback()
finally:
    db.close()
# Upsert: insert the row, or update every column when the primary key already
# exists. Reuses `table`, `keys`, `values` and `data` defined earlier in the
# file.
sql = 'INSERT INTO {table}({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE '.format(table=table, keys=keys, values=values)
update = ','.join([" {key} =%s".format(key=key) for key in data])
sql += update
# Open a fresh connection: the one used by the previous snippet has already
# been closed.
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306,
                     database='spiders')
cursor = db.cursor()
try:
    # The statement holds two sets of placeholders (INSERT values and UPDATE
    # values), so the parameter tuple is repeated twice.
    if cursor.execute(sql, tuple(data.values()) * 2):
        print('Successful')
        db.commit()
except pymysql.MySQLError:
    print('Failed')
    db.rollback()
finally:
    db.close()
完整的sql:
INSERT INTO students(id,name,age) VALUES (%s,%s,%s) ON DUPLICATE KEY UPDATE id=%s,name=%s,age=%s
因为这里变成了6个%s,所以execute()的第二个参数(元组)就需要乘以2。
# Query all students aged 20 or older and print the row count, the first row
# and then the remaining rows.
sql = 'SELECT * FROM students WHERE age>=20'
# Open a fresh connection: the one used by the previous snippet has already
# been closed.
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306,
                     database='spiders')
cursor = db.cursor()
try:
    cursor.execute(sql)
    print('Count:', cursor.rowcount)
    one = cursor.fetchone()
    print('One:', one)
    # fetchall() continues from the cursor position, so the row already
    # returned by fetchone() is not included again.
    results = cursor.fetchall()
    for row in results:
        print(row)
except pymysql.MySQLError:
    print('Error')
finally:
    db.close()
# pymongo is used by all of the MongoDB snippets below but was never imported.
import pymongo

# Connect to the local MongoDB server.
client = pymongo.MongoClient(host='localhost', port=27017)
# Equivalent form using a connection URI:
# client = pymongo.MongoClient('mongodb://localhost:27017/')
# Select the database.
db = client.test
# Select the collection (the counterpart of a table).
collection = db.students
# Insert two student documents in a single call.
student1 = dict(id='20170101', name='Jordan', age=20, gender='male')
student2 = dict(id='20170202', name='Mike', age=21, gender='male')
new_students = [student1, student2]
result = collection.insert_many(new_students)
print(result)
# Query a single document; the match comes back as a dict.
# (Use find() instead to query multiple documents.)
name_filter = {'name': 'Mike'}
result = collection.find_one(name_filter)
print(type(result))
print(result)
# Counting: Cursor.count() was removed in PyMongo 4, so use
# Collection.count_documents(), which takes the filter directly.
count = collection.count_documents({})
count = collection.count_documents({'age': 20})
# Sorting: ascending by name.
results = collection.find().sort('name', pymongo.ASCENDING)
# Offset and limit: skip the first two matches and return at most two.
results = collection.find().sort('name', pymongo.ASCENDING).skip(2).limit(2)
# Update: fetch the document, change it locally, then write it back.
condition = {'name': 'Kevin'}
student = collection.find_one(condition)
if student is None:
    # find_one() returns None when nothing matches, so there is nothing to
    # modify.
    print('No document matched', condition)
else:
    student['age'] = 25
    # Whole-document replacement: fields that are not present in `student`
    # are removed from the stored document. This replaces the legacy
    # Collection.update(condition, student), which was removed in PyMongo 4.
    result = collection.replace_one(condition, student)
    # Recommended: $set only touches the fields present in `student`; any
    # other stored fields are neither changed nor removed.
    # update_one() returns an UpdateResult.
    result = collection.update_one(condition, {'$set': student})
    # matched_count: documents matched by the filter;
    # modified_count: documents actually changed.
    print(result.matched_count, result.modified_count)
# Delete the first document whose name is Kevin.
kevin_filter = {'name': 'Kevin'}
result = collection.delete_one(kevin_filter)
# Delete every document whose age is below 25.
under_25_filter = {'age': {'$lt': 25}}
result = collection.delete_many(under_25_filter)
(1) 中只更新student字典内存在的字段,如果原先还有其他字段,则不会更新,也不会删除,而不用$set的话,则会把之前的数据全部用student字典替换,如果存在其他字段,则会被删除
matched_count:获得匹配的数据条数
modified_count:获得影响的数据条数
错误:redis.exceptions.AuthenticationError: Client sent AUTH, but no password is set
原因:Redis默认是没有密码的,而客户端连接时却传入了password参数。
解决:进入redis-cli.exe设置密码,例如执行 config set requirepass foobared
#连接Redis
from redis import StrictRedis,ConnectionPool
# Three ways to obtain a client: direct keyword arguments, an explicit
# connection pool, and a pool built from a connection URL.
conn_kwargs = dict(host='localhost', port=6379, db=0, password='foobared')
redis = StrictRedis(**conn_kwargs)
# Alternatively, build a connection pool from the same settings.
pool = ConnectionPool(**conn_kwargs)
# Or describe the connection with a URL and build the pool from it.
url = 'redis://:foobared@localhost:6379/0'
pool = ConnectionPool.from_url(url)
redis = StrictRedis(connection_pool=pool)
# Round-trip a value to check that the connection works.
redis.set('name', 'Bob')
print(redis.get('name'))
redis-dump:用于导出数据
redis-load:用于导入数据