A Web Crawler for Collecting Instagram Data

Experiment Requirements

Objective

Crawl target: http://www.instagram.com. Collect information from posts on the topic "network security".

Requirements
  1. Data to collect: user bio, number of posts, number of accounts followed, number of followers, posted image files, posting time, like count, comment count;
  2. Basic criteria:
    (1) Support collecting information published by specific, publicly accessible Instagram accounts (users), covering no fewer than 100 users;
    (2) The collected data must be stored in a database system.

Code

isn.py
# -*- coding: utf-8 -*-
import requests
import json
import urllib
import re
import time
import csv
import codecs
import pymysql

s = requests.session()
s.headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
}

search = "networksecurity"  # the hashtag/keyword to search for
website = "http://www.instagram.com"

q = urllib.parse.quote(search)

print(q)
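# Appending ?__a=1 asks Instagram to return the page data as JSON instead of HTML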
url1 = website+"/explore/tags/"+q+"/?__a=1"
requests.adapters.DEFAULT_RETRIES = 5

html = s.get(url1)

ans = json.loads(html.text)

pgn = 0

########################################
# Set up the database
db = pymysql.connect(host="localhost", port=3306, user="your MySQL username", password="your MySQL password", db="Instagram", charset="utf8")
cursor = db.cursor()
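# Note: the Info table is dropped and recreated on every run, so previous results are discarded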
cursor.execute("DROP TABLE IF EXISTS Info")
createTable = """CREATE TABLE Info(
  Username VARCHAR (50) NOT NULL ,
  PostsNumber VARCHAR (8) NOT NULL ,
  FansNumber VARCHAR (8) NOT NULL ,
  AttentionNumber VARCHAR (8) NOT NULL ,
  Ptime VARCHAR (20) NOT NULL ,
  PicURL VARCHAR (500) NOT NULL ,
  CommentNumber VARCHAR (8) NOT NULL ,
  LikeNumber VARCHAR (8) NOT NULL) ENGINE MyISAM DEFAULT CHARSET=utf8"""
cursor.execute(createTable)

# The ./Save directory must exist before running
f = open("./Save/"+str(search)+".txt", "w", encoding='utf-8')
csvfile = codecs.open("./Save/"+str(search)+".csv", 'wb', encoding='gb18030')

result = [] 

########################################
writer = csv.writer(csvfile)
data = ['Username', 'Bio', 'Posts', 'Followers', 'Following', 'PostTime', 'PictureURL', 'Content', 'Comments', 'Likes']
writer.writerow(data)

# Parse the "top posts" section of the hashtag page
edges = ans['graphql']['hashtag']['edge_hashtag_to_top_posts']['edges']
n = 0
for i in range (len(edges)): 
  temp_dict = {}  
  if len(edges[i]['node']['edge_media_to_caption']['edges']) == 0:
    continue
  d = edges[i]['node']['edge_media_to_caption']['edges'][0]['node']['text']
  shortcode = edges[i]['node']['shortcode']
  url2 = website+"/p/"+shortcode+"/?__a=1"
  getnt = s.get(url2, verify=False)
  getnt = json.loads(getnt.text)

  username = getnt['graphql']['shortcode_media']['owner']['username']
  ptime = getnt['graphql']['shortcode_media']['taken_at_timestamp']
  commentNumber = getnt['graphql']['shortcode_media']['edge_media_to_parent_comment']['count']
  likeNumber = getnt['graphql']['shortcode_media']['edge_media_preview_like']['count']
  picurl = getnt['graphql']['shortcode_media']['display_url']
  ptime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ptime))
  ############################## user profile info
  userurl = website + '/' +username + "/?__a=1"
  print("userurl:", userurl)
  userhtml = s.get(userurl)
  userhtml = userhtml.text
  # ?__a=1 returns the profile data directly as JSON, so no HTML parsing is needed
  shared_data = json.loads(userhtml)
  #print(shared_data)
  postsNumber = shared_data['graphql']['user']['edge_owner_to_timeline_media']['count']  # number of posts published by the user
  fansNumber = shared_data['graphql']['user']['edge_followed_by']['count']  # follower count
  attentionNumber = shared_data['graphql']['user']['edge_follow']['count']  # following count
  biography = shared_data['graphql']['user']['biography']  # user bio
  ###################
  print (username)
  print (ptime)
  print (d)
  print("comment Number:" + str(commentNumber))
  print("like number:",getnt['graphql']['shortcode_media']['edge_media_preview_like']['count'])
  print("picture url:" + picurl)
  temp_dict['author'] = username  
  temp_dict['date'] = ptime  
  temp_dict['comment'] = re.sub(r'\s+',' ', d) 
  result.append(temp_dict) 
  f.writelines("Username: ")
  f.writelines(username)
  f.writelines('\n')
  f.writelines("Bio: ")
  f.writelines(biography)
  f.writelines('\n')
  f.writelines("Posts: ")
  postsNum = str(postsNumber)
  f.writelines(postsNum)
  f.writelines('\n')
  f.writelines("Followers: ")
  fansNum = str(fansNumber)
  f.writelines(fansNum)
  f.writelines('\n')
  f.writelines("Following: ")
  attentionNum = str(attentionNumber)
  f.writelines(attentionNum)
  f.writelines('\n')
  f.writelines("Post time: ")
  f.writelines(ptime)
  f.writelines('\n')
  f.writelines("Picture URL: ")
  picU = str(picurl)
  f.writelines(picU)
  f.writelines('\n')
  f.writelines("Content: ")
  content = re.sub(r'\s+', ' ', d)
  f.writelines(content)
  f.writelines('\n')
  f.writelines("Comments: ")
  commentNum = str(commentNumber)
  f.writelines(commentNum)
  f.writelines('\n')
  f.writelines("Likes: ")
  likeNum = str(likeNumber)
  f.writelines(likeNum)
  f.writelines('\n')
  f.writelines('\n')
  data = [username, biography, postsNumber, fansNumber, attentionNumber, ptime, picurl,
          re.sub(r'\s+', ' ', d), commentNumber, likeNumber]
  writer.writerow(data)
  sql = """INSERT INTO Info (Username, PostsNumber, FansNumber, AttentionNumber, Ptime, PicURL, CommentNumber, LikeNumber) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"""
  val = (username, postsNum, fansNum, attentionNum, ptime, picU, commentNum, likeNum)
  try:
    cursor.execute(sql, val)
    db.commit()
    print("Row inserted into the database.")
  except Exception:
    db.rollback()

b = ans['graphql']['hashtag']["edge_hashtag_to_media"]
hnp = b['page_info']['has_next_page']
hashn = b['page_info']['end_cursor']
print (hnp,hashn)

# If the condition below is changed to pgn != -1, all results will be crawled; the amount of data could be very large
while hnp == True and pgn != 300:
    pgn = pgn + 1 
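    # The query_hash endpoint pages through the tag's media six posts at a time;
    # the URL-encoded variables are {"tag_name": q, "first": 6, "after": hashn}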
    url1 = website+"/graphql/query/?query_hash=298b92c8d7cad703f7565aa892ede943&variables=%7B%22tag_name%22%3A%22"+q+"%22%2C%22first%22%3A6%2C%22after%22%3A%22"+hashn+"%22%7D"
    print (url1)
    html = s.get(url1, verify=False)
    try:
      ans = json.loads(html.text)
    except:
      v = open("bug.txt","w")
      v.writelines(html.text)
      v.close()
      print ("ERROR")
      url1 = website+"/graphql/query/?query_hash=298b92c8d7cad703f7565aa892ede943&variables=%7B%22tag_name%22%3A%22"+q+"%22%2C%22first%22%3A6%2C%22after%22%3A%22"+hashn+"%22%7D"
      # print (url1)
      html = s.get(url1, verify=False)
      ans = json.loads(html.text)
      # continue;

    try:
      edges = ans['data']['hashtag']['edge_hashtag_to_media']['edges']
    except:
      v = open("bug.txt","w")
      v.writelines(html.text)
      v.close()
      print ("ERROR")
      url1 = website+"/graphql/query/?query_hash=298b92c8d7cad703f7565aa892ede943&variables=%7B%22tag_name%22%3A%22"+q+"%22%2C%22first%22%3A6%2C%22after%22%3A%22"+hashn+"%22%7D"
      # print (url1)
      html = s.get(url1, verify=False)
      ans = json.loads(html.text)
      edges = ans['data']['hashtag']['edge_hashtag_to_media']['edges']

    for i in range (len(edges)):  
      temp_dict = {}
      # print ((len(edges))) 
      if len(edges[i]['node']['edge_media_to_caption']['edges']) == 0:
        continue
      d = edges[i]['node']['edge_media_to_caption']['edges'][0]['node']['text']
      shortcode = edges[i]['node']['shortcode']
      url2 = website+"/p/"+shortcode+"/?__a=1"
      getnt = s.get(url2, verify=False)
      try:
        getnt = json.loads(getnt.text)
      except:
        url2 = website+"/p/"+shortcode+"/?__a=1"
        getnt = s.get(url2, verify=False)
        getnt = json.loads(getnt.text)

      username = getnt['graphql']['shortcode_media']['owner']['username']
      ptime = getnt['graphql']['shortcode_media']['taken_at_timestamp']
      nd = re.sub(r'\s+', ' ', d)
      commentNumber = getnt['graphql']['shortcode_media']['edge_media_to_parent_comment']['count']
      likeNumber = getnt['graphql']['shortcode_media']['edge_media_preview_like']['count']
      picurl = getnt['graphql']['shortcode_media']['display_url']
      ptime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ptime))
      ############################## user profile info
      userurl = website + '/' + username + "/?__a=1"
      print("userurl:", userurl)
      userhtml = s.get(userurl)
      userhtml = userhtml.text
      # ?__a=1 returns the profile data directly as JSON
      shared_data = json.loads(userhtml)
      # print(shared_data)
      postsNumber = shared_data['graphql']['user']['edge_owner_to_timeline_media']['count']  # number of posts published by the user
      fansNumber = shared_data['graphql']['user']['edge_followed_by']['count']  # follower count
      attentionNumber = shared_data['graphql']['user']['edge_follow']['count']  # following count
      biography = shared_data['graphql']['user']['biography']  # user bio
      ###################

      f.writelines("Username: ")
      f.writelines(username)
      f.writelines('\n')
      f.writelines("Bio: ")
      f.writelines(biography)
      f.writelines('\n')
      f.writelines("Posts: ")
      postsNum = str(postsNumber)
      f.writelines(postsNum)
      f.writelines('\n')
      f.writelines("Followers: ")
      fansNum = str(fansNumber)
      f.writelines(fansNum)
      f.writelines('\n')
      f.writelines("Following: ")
      attentionNum = str(attentionNumber)
      f.writelines(attentionNum)
      f.writelines('\n')
      f.writelines("Post time: ")
      f.writelines(ptime)
      f.writelines('\n')
      f.writelines("Picture URL: ")
      picU = str(picurl)
      f.writelines(picU)
      f.writelines('\n')
      f.writelines("Content: ")
      f.writelines(re.sub(r'\s+', ' ', d))
      f.writelines('\n')
      f.writelines("Comments: ")
      commentNum = str(commentNumber)
      f.writelines(commentNum)
      f.writelines('\n')
      f.writelines("Likes: ")
      likeNum = str(likeNumber)
      f.writelines(likeNum)
      f.writelines('\n')
      f.writelines('\n')
      data = [username, biography, postsNumber, fansNumber, attentionNumber, ptime, picurl,
              re.sub(r'\s+', ' ', d), commentNumber, likeNumber]
      writer.writerow(data)
      n = n + 1
      sql = "INSERT INTO Info (Username, PostsNumber, FansNumber, AttentionNumber, Ptime, PicURL, CommentNumber, LikeNumber) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
      val = (username, postsNum, fansNum, attentionNum, ptime, picU, commentNum, likeNum)
      try:
        cursor.execute(sql, val)
        db.commit()
        print("Row inserted into the database.")
      except Exception:
        db.rollback()

    b = ans['data']['hashtag']["edge_hashtag_to_media"]
    hnp = b['page_info']['has_next_page']
    hashn = b['page_info']['end_cursor'] 
    print (hnp,hashn,pgn,len(edges))
f.close()
csvfile.close()
db.close()

Readme


A simple Instagram crawler. Make sure a global proxy is turned on before you start, since you need one to reach Instagram.


Contents

  • Background
  • Project Overview
  • Usage
    • Getting the Code
    • Notes
  • References

Background

Instagram is a mobile, image-centric social networking application.

Project Overview

Be sure that a global proxy is enabled before you run the code.

The images on an Instagram profile page are generated dynamically by JavaScript, which means we cannot simply collect all the URLs we need with requests: the content returned by a plain GET is only the page skeleton plus a pile of JS scripts, and the actual data is loaded asynchronously via Ajax. Inspecting the page shows that the image links are wrapped as JSON inside window._sharedData; parsing that JSON reveals that the image URLs sit in the nodes data. Further research turned up a URL pattern that returns the page data directly in JSON format.
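As an illustration, here is a minimal sketch of pulling that embedded JSON out of a profile page. It assumes the page still embeds window._sharedData in a <script> tag; the profile name is only an example.

import json
import requests

# Hypothetical public profile used only for illustration
url = "https://www.instagram.com/instagram/"
html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text

# The JSON blob sits between "window._sharedData = " and the closing ";</script>"
raw = html.split("window._sharedData = ")[1].split(";</script>")[0]
shared_data = json.loads(raw)

# Profile data is nested under entry_data -> ProfilePage (the structure may change over time)
user = shared_data["entry_data"]["ProfilePage"][0]["graphql"]["user"]
print(user["edge_owner_to_timeline_media"]["count"])  # number of posts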

This crawler mainly fetches JSON-formatted data from Instagram through the following URLs, parses out the fields we need, and saves them:

  1. www.instagram.com/username/?__a=1
  2. www.instagram.com/explore/tags/sometag/?__a=1
  3. www.instagram.com/p/someShortCode/?__a=1

First, URL 2 is used to get the search results for the given tag; from each result, URL 3 is used to view the details of that post (the published picture information); from URL 3 the poster's username is extracted, and URL 1 is used to fetch the poster's basic profile information.
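Condensed, the crawl flow looks roughly like the sketch below. It assumes the ?__a=1 endpoints still return the JSON structure that isn.py relies on; the tag is only an example.

import json
import requests

s = requests.session()
s.headers = {"User-Agent": "Mozilla/5.0"}
base = "https://www.instagram.com"
tag = "networksecurity"  # example tag

# Step 2: search results for the tag, as JSON
tag_json = json.loads(s.get(base + "/explore/tags/" + tag + "/?__a=1").text)
edges = tag_json["graphql"]["hashtag"]["edge_hashtag_to_top_posts"]["edges"]

# Step 3: details of the first post found under the tag
shortcode = edges[0]["node"]["shortcode"]
post_json = json.loads(s.get(base + "/p/" + shortcode + "/?__a=1").text)
username = post_json["graphql"]["shortcode_media"]["owner"]["username"]

# Step 1: the poster's basic profile information, as JSON
user_json = json.loads(s.get(base + "/" + username + "/?__a=1").text)
print(user_json["graphql"]["user"]["edge_followed_by"]["count"])  # follower count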

The crawled fields are: username, bio, number of posts, follower count, following count, posting time, picture URL, content, comment count, and like count.

The results are written to a .txt file, a .csv file, and a MySQL database.
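To verify the requirement of at least 100 users after a run, you can count the distinct usernames stored in the Info table. This is a sketch that assumes the same connection settings as isn.py:

import pymysql

# Replace the credentials with your own, as in isn.py
db = pymysql.connect(host="localhost", port=3306, user="your MySQL username", password="your MySQL password", db="Instagram", charset="utf8")
cursor = db.cursor()
cursor.execute("SELECT COUNT(DISTINCT Username) FROM Info")
print("distinct users collected:", cursor.fetchone()[0])
db.close()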

Usage

The required libraries are:

import requests
import json
import urllib
import io
import os
import http.cookiejar
import re
import time
import csv
import codecs
import pymysql

You can change the value of search to crawl posts for a specific keyword. The value is used as an Instagram hashtag, so it should not contain spaces (the topic "network security" becomes the tag networksecurity).

search = "networksecurity"  # the hashtag/keyword to search for
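For reference, the tag URL that isn.py requests is built from search roughly like this:

import urllib.parse

search = "networksecurity"      # hashtags cannot contain spaces
q = urllib.parse.quote(search)  # URL-encode the tag, as isn.py does
url = "http://www.instagram.com/explore/tags/" + q + "/?__a=1"
print(url)  # http://www.instagram.com/explore/tags/networksecurity/?__a=1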

Getting the Code

GitHub project page: https://github.com/SDWDD/Ins-Crawler


Notes

Before running, be sure to change the database username and password to your own!

#######################################
# Set up the database
db = pymysql.connect(host="localhost", port=3306, user="your MySQL username", password="your MySQL password", db="Instagram", charset="utf8")

References

The following projects were consulted during development:

  1. https://github.com/timgrossmann/instagram-profilecrawl
  2. https://github.com/anonymouslycn/Instagrambot
  3. https://github.com/huijay12/instagram-photo-crawler
