首先在文件头部引入所需的库:
import os
import requests
from pyquery import PyQuery as pq
import pymysql # 用于连接并操作MySQL数据库
设置请求头。User-Agent标识的是发起请求的浏览器(不同浏览器的User-Agent不同,而不是每个网站不同),可以在浏览器的开发者工具里查到。比如我爬取信息的网站是:https://www.icourse163.org/university/view/all.htm#/ 。用火狐打开该网址后右击选择“查看元素”,点击“网络”,选中任意一个请求,在请求头中找到User-Agent(即截图中圈红色的部分)。
然后插入代码:
# Browser-like User-Agent string sent with the listing-page request.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/68.0.3423.2 Safari/537.36'
)

# Request headers passed to requests.get() by search().
headers = {'User-Agent': _USER_AGENT}
接着写搜索函数,这个函数负责发送请求,并在请求成功(状态码为200)时返回页面的HTML文本:
def search():
    """Fetch the university listing page and return its HTML.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any failed request).
    """
    url = 'https://www.icourse163.org/university/view/all.htm#/'
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Base class of ConnectionError, Timeout and the other request errors,
        # so a read timeout no longer escapes as an unhandled exception.
        return None
    if response.status_code == 200:
        return response.text
    return None
然后写解析函数,从页面HTML中提取想要的信息(每张图片的标题和地址):
def get_image(html):
    """Yield a ``{'title': ..., 'image': ...}`` dict for each matched element.

    *html* is parsed with PyQuery; for every element matching the selector,
    the ``alt`` and ``src`` attributes of its ``img`` are reported as the
    title and image URL respectively.
    """
    document = pq(html)
    # .items() is needed so that each match comes back as a PyQuery object.
    for entry in document('.g-flow .u-usitys .u-usity ').items():
        img = entry.find('img')
        yield {'title': img.attr('alt'), 'image': img.attr('src')}
可能有小伙伴不知道怎么找到这些信息在页面中的位置:用火狐打开网站,在相应的信息上点击右键,选择“查看元素”即可:
然后写存储图片的函数:
def save_image(item):
    """Download the image described by *item* into the local image folder.

    Args:
        item: Mapping with an ``'image'`` URL and a ``'title'``; the file is
            stored as ``<title>.jpg`` inside the target directory.

    Prints a message when the file already exists or when the request fails.
    Responses with a status other than 200 are skipped without output.
    """
    file_path_all = 'D:/python-pro/partners'  # local directory for the images
    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(file_path_all, exist_ok=True)

    # Files are named after the title, so items sharing a title map to the
    # same file and only the first one is kept.
    file_path = '{0}/{1}.{2}'.format(file_path_all, item.get('title'), 'jpg')
    if os.path.exists(file_path):
        # Checked before downloading so an existing file costs no request.
        print('Already Download', file_path)
        return

    try:
        response = requests.get(item.get('image'), timeout=10)
    except requests.RequestException:
        # Also covers timeouts and a missing/invalid URL, so one bad item
        # does not abort the whole run.
        print('Failed to Save Image')
        return

    if response.status_code == 200:
        with open(file_path, 'wb') as f:
            f.write(response.content)
最后写Main函数:
def main():
    """Scrape the listing page, save every image and record it in MySQL.

    For each item produced by ``get_image`` the image is stored locally via
    ``save_image`` and a row ``(name, url, parId)`` is inserted into the
    ``partners`` table, where ``parId`` is the 1-based position of the item.
    The database connection is always closed, even when an error occurs.
    """
    connection = pymysql.connect(host='localhost',  # connect to the database
                                 user='root',
                                 password='',  # the password chosen when MySQL was installed
                                 db='test2',
                                 charset='utf8',
                                 cursorclass=pymysql.cursors.DictCursor)
    sql = "insert into partners(name,url,parId)values(%s,%s,%s)"
    try:
        html = search()
        if not html:
            # search() returns None on failure; there is nothing to parse.
            print('Failed to fetch page')
            return None
        # The context manager closes the cursor when the loop is done.
        with connection.cursor() as cursor:
            for i, item in enumerate(get_image(html), start=1):
                print([i, item])
                save_image(item)
                cursor.execute(sql, (item['title'], item['image'], i))
                # Commit per row so already-processed items survive a later failure.
                connection.commit()
    finally:
        connection.close()
    return None


if __name__ == '__main__':
    main()
运行结果如下:
存入本地的图片:
存入数据库的数据:
最后把所有代码附上:
import os
import requests
from pyquery import PyQuery as pq
import pymysql # 用于连接并操作MySQL数据库
# Browser-like User-Agent string sent with the listing-page request.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/68.0.3423.2 Safari/537.36'
)

# Request headers passed to requests.get() by search().
headers = {'User-Agent': _USER_AGENT}
def search():
    """Fetch the university listing page and return its HTML.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any failed request).
    """
    url = 'https://www.icourse163.org/university/view/all.htm#/'
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Base class of ConnectionError, Timeout and the other request errors,
        # so a read timeout no longer escapes as an unhandled exception.
        return None
    if response.status_code == 200:
        return response.text
    return None
def get_image(html):
    """Yield a ``{'title': ..., 'image': ...}`` dict for each matched element.

    *html* is parsed with PyQuery; for every element matching the selector,
    the ``alt`` and ``src`` attributes of its ``img`` are reported as the
    title and image URL respectively.
    """
    document = pq(html)
    # .items() is needed so that each match comes back as a PyQuery object.
    for entry in document('.g-flow .u-usitys .u-usity ').items():
        img = entry.find('img')
        yield {'title': img.attr('alt'), 'image': img.attr('src')}
def save_image(item):
    """Download the image described by *item* into the local image folder.

    Args:
        item: Mapping with an ``'image'`` URL and a ``'title'``; the file is
            stored as ``<title>.jpg`` inside the target directory.

    Prints a message when the file already exists or when the request fails.
    Responses with a status other than 200 are skipped without output.
    """
    file_path_all = 'D:/python-pro/partners'  # local directory for the images
    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(file_path_all, exist_ok=True)

    # Files are named after the title, so items sharing a title map to the
    # same file and only the first one is kept.
    file_path = '{0}/{1}.{2}'.format(file_path_all, item.get('title'), 'jpg')
    if os.path.exists(file_path):
        # Checked before downloading so an existing file costs no request.
        print('Already Download', file_path)
        return

    try:
        response = requests.get(item.get('image'), timeout=10)
    except requests.RequestException:
        # Also covers timeouts and a missing/invalid URL, so one bad item
        # does not abort the whole run.
        print('Failed to Save Image')
        return

    if response.status_code == 200:
        with open(file_path, 'wb') as f:
            f.write(response.content)
def main():
    """Scrape the listing page, save every image and record it in MySQL.

    For each item produced by ``get_image`` the image is stored locally via
    ``save_image`` and a row ``(name, url, parId)`` is inserted into the
    ``partners`` table, where ``parId`` is the 1-based position of the item.
    The database connection is always closed, even when an error occurs.
    """
    connection = pymysql.connect(host='localhost',  # connect to the database
                                 user='root',
                                 password='',  # the password chosen when MySQL was installed
                                 db='test2',
                                 charset='utf8',
                                 cursorclass=pymysql.cursors.DictCursor)
    sql = "insert into partners(name,url,parId)values(%s,%s,%s)"
    try:
        html = search()
        if not html:
            # search() returns None on failure; there is nothing to parse.
            print('Failed to fetch page')
            return None
        # The context manager closes the cursor when the loop is done.
        with connection.cursor() as cursor:
            for i, item in enumerate(get_image(html), start=1):
                print([i, item])
                save_image(item)
                cursor.execute(sql, (item['title'], item['image'], i))
                # Commit per row so already-processed items survive a later failure.
                connection.commit()
    finally:
        connection.close()
    return None


if __name__ == '__main__':
    main()
注意事项: