Crawl target: http://www.instagram.com — collect information on posts under the topic "network security".
# -*- coding: utf-8 -*-
import requests
import json
import urllib.parse
import os
import re
import time
import csv
import codecs
import pymysql
s = requests.session()
s.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
}
search = "networksecurity" # the search term
website = "http://www.instagram.com"
q = urllib.parse.quote(search)
print(q)
url1 = website+"/explore/tags/"+q+"/?__a=1"
requests.adapters.DEFAULT_RETRIES = 5
html = s.get(url1)
ans = json.loads(html.text)
pgn = 0
########################################
# Set up the MySQL database
db = pymysql.connect(host="localhost", port=3306, user="your MySQL username", password="your MySQL password", db="Instagram", charset="utf8")
cursor = db.cursor()
cursor.execute("DROP TABLE IF EXISTS Info")
createTable = """CREATE TABLE Info(
Username VARCHAR (50) NOT NULL ,
PostsNumber VARCHAR (8) NOT NULL ,
FansNumber VARCHAR (8) NOT NULL ,
AttentionNumber VARCHAR (8) NOT NULL ,
Ptime VARCHAR (20) NOT NULL ,
PicURL VARCHAR (500) NOT NULL ,
CommentNumber VARCHAR (8) NOT NULL ,
LikeNumber VARCHAR (8) NOT NULL) ENGINE MyISAM DEFAULT CHARSET=utf8"""
cursor.execute(createTable)
os.makedirs("./Save", exist_ok=True)  # make sure the output directory exists
f = open("./Save/"+str(search)+".txt", "w", encoding='utf-8')
csvfile = codecs.open("./Save/"+str(search)+".csv", 'wb', encoding='gb18030')
result = []
########################################
writer = csv.writer(csvfile)
data = ['Username', 'Biography', 'Posts', 'Followers', 'Following', 'Post time', 'Picture URL', 'Content', 'Comments', 'Likes']
writer.writerow(data)
#
#
#
edges = ans['graphql']['hashtag']['edge_hashtag_to_top_posts']['edges']
n = 0
for i in range(len(edges)):
    temp_dict = {}
    if len(edges[i]['node']['edge_media_to_caption']['edges']) == 0:
        continue
    d = edges[i]['node']['edge_media_to_caption']['edges'][0]['node']['text']
    shortcode = edges[i]['node']['shortcode']
    url2 = website+"/p/"+shortcode+"/?__a=1"
    getnt = s.get(url2, verify=False)
    getnt = json.loads(getnt.text)
    username = getnt['graphql']['shortcode_media']['owner']['username']
    ptime = getnt['graphql']['shortcode_media']['taken_at_timestamp']
    commentNumber = getnt['graphql']['shortcode_media']['edge_media_to_parent_comment']['count']
    likeNumber = getnt['graphql']['shortcode_media']['edge_media_preview_like']['count']
    picurl = getnt['graphql']['shortcode_media']['display_url']
    ptime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ptime))
    ############################## poster's profile
    userurl = website + '/' + username + "/?__a=1"
    print("userurl:", userurl)
    userhtml = s.get(userurl)
    userhtml = userhtml.text
    shared_data = json.loads(userhtml)  # the ?__a=1 endpoint already returns JSON
    #print(shared_data)
    postsNumber = shared_data['graphql']['user']['edge_owner_to_timeline_media']['count']  # number of posts
    fansNumber = shared_data['graphql']['user']['edge_followed_by']['count']  # number of followers
    attentionNumber = shared_data['graphql']['user']['edge_follow']['count']  # number of accounts followed
    biography = shared_data['graphql']['user']['biography']  # user biography
    ###################
    print(username)
    print(ptime)
    print(d)
    print("comment number:" + str(commentNumber))
    print("like number:", likeNumber)
    print("picture url:" + picurl)
    temp_dict['author'] = username
    temp_dict['date'] = ptime
    temp_dict['comment'] = re.sub(r'\s+', ' ', d)
    result.append(temp_dict)
    f.writelines("Username: ")
    f.writelines(username)
    f.writelines('\n')
    f.writelines("Biography: ")
    f.writelines(biography)
    f.writelines('\n')
    f.writelines("Posts: ")
    postsNum = str(postsNumber)
    f.writelines(postsNum)
    f.writelines('\n')
    f.writelines("Followers: ")
    fansNum = str(fansNumber)
    f.writelines(fansNum)
    f.writelines('\n')
    f.writelines("Following: ")
    attentionNum = str(attentionNumber)
    f.writelines(attentionNum)
    f.writelines('\n')
    f.writelines("Post time: ")
    f.writelines(ptime)
    f.writelines('\n')
    f.writelines("Picture URL: ")
    picU = str(picurl)
    f.writelines(picU)
    f.writelines('\n')
    f.writelines("Content: ")
    content = re.sub(r'\s+', ' ', d)
    f.writelines(content)
    f.writelines('\n')
    f.writelines("Comments: ")
    commentNum = str(commentNumber)
    f.writelines(commentNum)
    f.writelines('\n')
    f.writelines("Likes: ")
    likeNum = str(likeNumber)
    f.writelines(likeNum)
    f.writelines('\n')
    f.writelines('\n')
    data = [username, biography, postsNumber, fansNumber, attentionNumber, ptime, picurl,
            re.sub(r'\s+', ' ', d), commentNumber, likeNumber]
    writer.writerow(data)
    sql = """INSERT INTO Info (Username, PostsNumber, FansNumber, AttentionNumber, Ptime, PicURL, CommentNumber, LikeNumber) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"""
    val = (username, postsNum, fansNum, attentionNum, ptime, picU, commentNum, likeNum)
    cursor.execute(sql, val)
    db.commit()
    print("Record written to the database.")
    #except:
    #    db.rollback()
b = ans['graphql']['hashtag']["edge_hashtag_to_media"]
hnp = b['page_info']['has_next_page']
hashn = b['page_info']['end_cursor']
print (hnp,hashn)
# If the condition below is changed to pgn != -1, the crawler will keep following pages until every result has been fetched, which can be an enormous amount of data
while hnp == True and pgn != 300:
    pgn = pgn + 1
    url1 = website+"/graphql/query/?query_hash=298b92c8d7cad703f7565aa892ede943&variables=%7B%22tag_name%22%3A%22"+q+"%22%2C%22first%22%3A6%2C%22after%22%3A%22"+hashn+"%22%7D"
    print(url1)
    html = s.get(url1, verify=False)
    try:
        ans = json.loads(html.text)
    except:
        v = open("bug.txt", "w")
        v.writelines(html.text)
        v.close()
        print("ERROR")
        url1 = website+"/graphql/query/?query_hash=298b92c8d7cad703f7565aa892ede943&variables=%7B%22tag_name%22%3A%22"+q+"%22%2C%22first%22%3A6%2C%22after%22%3A%22"+hashn+"%22%7D"
        # print(url1)
        html = s.get(url1, verify=False)
        ans = json.loads(html.text)
        # continue
    try:
        edges = ans['data']['hashtag']['edge_hashtag_to_media']['edges']
    except:
        v = open("bug.txt", "w")
        v.writelines(html.text)
        v.close()
        print("ERROR")
        url1 = website+"/graphql/query/?query_hash=298b92c8d7cad703f7565aa892ede943&variables=%7B%22tag_name%22%3A%22"+q+"%22%2C%22first%22%3A6%2C%22after%22%3A%22"+hashn+"%22%7D"
        # print(url1)
        html = s.get(url1, verify=False)
        ans = json.loads(html.text)
        edges = ans['data']['hashtag']['edge_hashtag_to_media']['edges']
    for i in range(len(edges)):
        temp_dict = {}
        # print(len(edges))
        if len(edges[i]['node']['edge_media_to_caption']['edges']) == 0:
            continue
        d = edges[i]['node']['edge_media_to_caption']['edges'][0]['node']['text']
        shortcode = edges[i]['node']['shortcode']
        url2 = website+"/p/"+shortcode+"/?__a=1"
        getnt = s.get(url2, verify=False)
        try:
            getnt = json.loads(getnt.text)
        except:
            url2 = website+"/p/"+shortcode+"/?__a=1"
            getnt = s.get(url2, verify=False)
            getnt = json.loads(getnt.text)
        username = getnt['graphql']['shortcode_media']['owner']['username']
        ptime = getnt['graphql']['shortcode_media']['taken_at_timestamp']
        nd = re.sub(r'\s+', ' ', d)
        commentNumber = getnt['graphql']['shortcode_media']['edge_media_to_parent_comment']['count']
        likeNumber = getnt['graphql']['shortcode_media']['edge_media_preview_like']['count']
        picurl = getnt['graphql']['shortcode_media']['display_url']
        ptime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ptime))
        ############################## poster's profile
        userurl = website + '/' + username + "/?__a=1"
        print("userurl:", userurl)
        userhtml = s.get(userurl)
        userhtml = userhtml.text
        shared_data = json.loads(userhtml)  # the ?__a=1 endpoint already returns JSON
        # print(shared_data)
        postsNumber = shared_data['graphql']['user']['edge_owner_to_timeline_media']['count']  # number of posts
        fansNumber = shared_data['graphql']['user']['edge_followed_by']['count']  # number of followers
        attentionNumber = shared_data['graphql']['user']['edge_follow']['count']  # number of accounts followed
        biography = shared_data['graphql']['user']['biography']  # user biography
        ###################
        f.writelines("Username: ")
        f.writelines(username)
        f.writelines('\n')
        f.writelines("Biography: ")
        f.writelines(biography)
        f.writelines('\n')
        f.writelines("Posts: ")
        postsNum = str(postsNumber)
        f.writelines(postsNum)
        f.writelines('\n')
        f.writelines("Followers: ")
        fansNum = str(fansNumber)
        f.writelines(fansNum)
        f.writelines('\n')
        f.writelines("Following: ")
        attentionNum = str(attentionNumber)
        f.writelines(attentionNum)
        f.writelines('\n')
        f.writelines("Post time: ")
        f.writelines(ptime)
        f.writelines('\n')
        f.writelines("Picture URL: ")
        picU = str(picurl)
        f.writelines(picU)
        f.writelines('\n')
        f.writelines("Content: ")
        f.writelines(re.sub(r'\s+', ' ', d))
        f.writelines('\n')
        f.writelines("Comments: ")
        commentNum = str(commentNumber)
        f.writelines(commentNum)
        f.writelines('\n')
        f.writelines("Likes: ")
        likeNum = str(likeNumber)
        f.writelines(likeNum)
        f.writelines('\n')
        f.writelines('\n')
        data = [username, biography, postsNumber, fansNumber, attentionNumber, ptime, picurl,
                re.sub(r'\s+', ' ', d), commentNumber, likeNumber]
        writer.writerow(data)
        n = n + 1
        sql = "INSERT INTO Info (Username, PostsNumber, FansNumber, AttentionNumber, Ptime, PicURL, CommentNumber, LikeNumber) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        val = (username, postsNum, fansNum, attentionNum, ptime, picU, commentNum, likeNum)
        cursor.execute(sql, val)
        db.commit()
        print("Record written to the database.")
        #except:
        #    db.rollback()
    b = ans['data']['hashtag']["edge_hashtag_to_media"]
    hnp = b['page_info']['has_next_page']
    hashn = b['page_info']['end_cursor']
    print(hnp, hashn, pgn, len(edges))
f.close()
csvfile.close()
db.close()
A simple Instagram crawler. Before running the code, make sure a system-wide (global) proxy is enabled, since Instagram is not directly reachable from some networks.
Instagram is a mobile, image-centric social networking application.
The images on an Instagram profile page are generated dynamically by JavaScript, so we cannot simply collect all the URLs we need with requests: a plain GET returns only the page skeleton and a pile of JS scripts, because the actual data is loaded asynchronously via Ajax. Inspecting the page shows that the image links are embedded, as JSON, inside window._sharedData; parsing that JSON reveals the image URLs inside the nodes data. Further digging turned up a URL pattern that returns the page data directly in JSON format.
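For illustration, here is a minimal sketch of both approaches. The username instagram, the ";</script>" delimiter, and the entry_data/ProfilePage key path are assumptions based on how Instagram's web pages looked when this crawler was written; they are not guaranteed to still work.

import json
import requests

s = requests.session()
s.headers = {'User-Agent': 'Mozilla/5.0'}

# Approach 1: pull the JSON embedded in the profile HTML out of window._sharedData.
html = s.get("https://www.instagram.com/instagram/").text
raw = html.split("window._sharedData = ")[1].split(";</script>")[0]
user = json.loads(raw)['entry_data']['ProfilePage'][0]['graphql']['user']

# Approach 2: append ?__a=1 to the same URL and receive JSON directly (what this crawler does).
user = s.get("https://www.instagram.com/instagram/?__a=1").json()['graphql']['user']

print(user['username'], user['edge_followed_by']['count'])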
The crawler works mainly by requesting the following URLs, which return Instagram's data in JSON format, then parsing out the fields we need and saving them:
1. http://www.instagram.com/<username>/?__a=1 — a user's profile
2. http://www.instagram.com/explore/tags/<tag>/?__a=1 — search results for a hashtag
3. http://www.instagram.com/p/<shortcode>/?__a=1 — the details of a single post
It first fetches the search results for the given hashtag via (2), then follows each result to (3) to read the post's details (the published image information), and finally takes the poster's username from (3) to (1) to fetch the poster's basic profile information, as sketched below.
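A minimal sketch of that three-step chain, using the same ?__a=1 endpoints and JSON key paths as the full script above (again, these reflect the historical web API and may no longer behave this way):

import requests

s = requests.session()
s.headers = {'User-Agent': 'Mozilla/5.0'}
website = "http://www.instagram.com"

# (2) search results for the hashtag
tag = s.get(website + "/explore/tags/networksecurity/?__a=1").json()
edges = tag['graphql']['hashtag']['edge_hashtag_to_top_posts']['edges']
shortcode = edges[0]['node']['shortcode']

# (3) details of one post, including the poster's username
post = s.get(website + "/p/" + shortcode + "/?__a=1").json()
username = post['graphql']['shortcode_media']['owner']['username']

# (1) the poster's basic profile information
user = s.get(website + "/" + username + "/?__a=1").json()['graphql']['user']
print(username, user['edge_followed_by']['count'], user['edge_follow']['count'])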
The fields collected are: username, biography, number of posts, number of followers, number of accounts followed, post time, picture URL, caption text, comment count, and like count.
The results are written to a .txt file, a .csv file, and a MySQL database.
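Because the counts are stored as VARCHAR in the Info table, one way to inspect the results afterwards is to cast them when querying. A small sketch (the credentials are placeholders, as in the script above):

import pymysql

db = pymysql.connect(host="localhost", port=3306, user="your MySQL username",
                     password="your MySQL password", db="Instagram", charset="utf8")
cursor = db.cursor()
# Top ten collected posts by like count, cast from VARCHAR to a number for sorting.
cursor.execute("SELECT Username, FansNumber, LikeNumber FROM Info "
               "ORDER BY CAST(LikeNumber AS UNSIGNED) DESC LIMIT 10")
for row in cursor.fetchall():
    print(row)
db.close()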
The required libraries are:
import requests
import json
import urllib
import io
import os
import http.cookiejar
import re
import time
import csv
import codecs
import pymysql
You can crawl a different keyword by changing the value of search:
search = "networksecurity" # the search term
github项目主页: https://github.com/SDWDD/Ins-Crawler
Before running, be sure to change these to your own MySQL username and password!
#######################################
# Set up the MySQL database
db = pymysql.connect(host="localhost", port=3306, user="your MySQL username", password="your MySQL password", db="Instagram", charset="utf8")
The following code was consulted while writing this crawler: