Please credit the source when reposting.
from mongoengine import Document, ObjectIdField, StringField, IntField, DateTimeField

class twitter_post(Document):
    # MongoDB document model for one crawled tweet
    _id = ObjectIdField(primary_key=True)
    screen_name = StringField(max_length=128)
    text = StringField(required=True, max_length=2048)
    text_id = IntField(required=True)
    created_at = DateTimeField(required=True)
    in_reply_to_screen_name = StringField(max_length=64)
    retweet_count = IntField()
    favorite_count = IntField()
    source = StringField(max_length=1024)
    longitude = StringField(max_length=32)
    latitude = StringField(max_length=32)
    location = StringField(max_length=256)
    country_code = StringField(max_length=64)
    lang = StringField(max_length=4)
    time_zone = StringField(max_length=64)
    province = StringField(max_length=64)
    city = StringField(max_length=64)
    district = StringField(max_length=64)
    street = StringField(max_length=64)
    street_number = StringField(max_length=64)

    meta = {
        'ordering': ['created_at', 'screen_name'],
        'collection': 'twitter_posts'
    }
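Once documents have been saved, the same model can be used to query them back; a minimal sketch, assuming the connect() call shown further below has already run:

# Hypothetical queries against the twitter_posts collection (illustrative only)
recent_en = twitter_post.objects(lang='en').limit(10)            # ten English-language tweets
beijing_count = twitter_post.objects(province='北京市').count()   # tweets reverse-geocoded to Beijing
for post in recent_en:
    print(post.created_at, post.screen_name, post.text[:50])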
import requests

def GetAddress(lat, lng):
    """Reverse-geocode a (latitude, longitude) pair with the Baidu geocoder API."""
    url = 'http://api.map.baidu.com/geocoder/v2/'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'}
    payload = {'output': 'json', 'ak': 'pAjezQsQBe8v1c1Lel87r4vprwXiGCEn'}
    # Baidu expects the location parameter as "latitude,longitude"
    payload['location'] = '{0},{1}'.format(lat, lng)
    print(lat, lng)
    try:
        content = requests.get(url, params=payload, headers=header).json()
        content = content['result']['addressComponent']
        if not content['street']:  # some locations come back without street information
            content['street'] = 'NULL'
        if not content['street_number']:
            content['street_number'] = 'NULL'
    except Exception:
        # the request failed or the response did not contain an address
        content = {'province': 'NULL', 'city': 'NULL', 'district': 'NULL',
                   'street': 'NULL', 'street_number': 'NULL'}
    return content

print(GetAddress(40.07571952, 116.60609467))  # quick sanity check with a point in Beijing
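For reference, a successful response from this Baidu endpoint nests the fields read above under result.addressComponent, roughly in the following shape (the values here are illustrative placeholders, not real output):

# Illustrative response shape only; the field values are made up
{
    "status": 0,
    "result": {
        "formatted_address": "...",
        "addressComponent": {
            "country": "中国",
            "province": "北京市",
            "city": "北京市",
            "district": "...",
            "street": "...",
            "street_number": "..."
        }
    }
}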
import datetime
import tweepy
from tweepy.parsers import JSONParser
from bson import ObjectId
from mongoengine import connect

consumer_key = 'I1XowkiAc72fEp2CXPv0'
consumer_secret = 'drfnZHVUQrq1dyeqepCrbKyGWeYJCeTFQZpkLcXkgKFw3P'
access_key = '936432882482143235-jNLGPsCpZaSqR1D2WarSEshgQcyi'
access_secret = 'YF4ddleSgGxj8BsfmH2DELr7TsNNKAp08ZvqC'
# consumer_key = 'qEgHKHnL55g7k4U9xih'
# consumer_secret = 'QcUDHJS04wK5hrmlxV5C4gweiRPDca9JQoc4gp7ft'
# access_key = '863573499436122112-LA60oJLBzwVnhZjGOUPzRsJc'
# access_secret = '8CKFpp6qyxkAk1KfjWJPoHKloppPrvd7Tjiwllyk'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth_handler=auth, parser=JSONParser(), proxy='127.0.0.1:1080', wait_on_rate_limit=True)
conn = connect('ANXIETY', alias='default', host='localhost', port=27017, username='', password='')  # connect to the local MongoDB instance
Tweepy's search API only covers roughly the past week of tweets, a single api.search() call returns at most 100 tweets, and the official limits allow roughly 450 consecutive requests, so one pass can collect at most around 40,000 tweets. To gather as much data as possible we set MAX_QUERIES and page through the results repeatedly. This program only crawls data for Beijing, Shanghai, Hong Kong, Macau and Taiwan; the query topics are listed in the line.csv file:
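MAX_QUERIES, GMT_FORMAT and until are referenced below but never defined in this excerpt; a minimal sketch of plausible values (the exact number and the end date are assumptions):

MAX_QUERIES = 450  # assumed: roughly the request budget mentioned above
GMT_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'  # format of Twitter's created_at field
until = (datetime.date.today() + datetime.timedelta(days=1)).strftime('%Y-%m-%d')  # assumed: search up to tomorrow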
regions = ["beijing","shanghai","hongkong","macau","taiwan"]
for line in open("normal_user.csv"):
#try:
#u = api.get_user(line)
#ms = myStream.filter(track=[line])
#print(results[0])
for r in regions:#对每一个地理位置
places = api.geo_search(query=r)["result"]["places"][0]#首先获取地理位置ID
print(places)
place_id = places["id"]
tweet_id = []
i = MAX_QUERIES
MAX_ID = 10
while i > 0:
if MAX_ID == 10:
#tweets = api.search(q="place:%s AND line" % place_id)#根据地理位置和关键词同时过去爬取
                # set the end date past the seven-day window and count=100 so each request returns as much data as possible
                for tweet in api.search(q="place:%s" % place_id, count=100, until=until)["statuses"]:
                    #print(tweet)
                    #j.write(json.dumps(tweet)+'\n')
                    tweet_id.append(tweet["id"])
                    Obj_id = ObjectId()
                    # tweet["coordinates"] is GeoJSON, so the order is [longitude, latitude];
                    # the Baidu reverse geocoder is called with latitude first
                    coords = tweet["coordinates"]["coordinates"] if tweet["coordinates"] is not None else None
                    addr = GetAddress(coords[1], coords[0]) if coords is not None else None
                    tweet_item = twitter_post(  # build one MongoDB document
                        _id=Obj_id,
                        text_id=tweet["id"],
                        created_at=datetime.datetime.strptime(tweet["created_at"], GMT_FORMAT),
                        screen_name=tweet["user"]["screen_name"],
                        favorite_count=tweet["favorite_count"],
                        retweet_count=tweet["retweet_count"],
                        text=tweet["text"],
                        source=tweet["source"],
                        country_code=(tweet["place"]["country_code"] if tweet["place"] is not None else 'NULL'),
                        location=tweet["user"]["location"],
                        latitude=str(coords[1]) if coords is not None else 'NULL',
                        longitude=str(coords[0]) if coords is not None else 'NULL',
                        time_zone=tweet["user"]["time_zone"],
                        lang=tweet["lang"],
                        province=addr['province'] if addr is not None else 'NULL',
                        city=addr['city'] if addr is not None else 'NULL',
                        district=addr['district'] if addr is not None else 'NULL',
                        street=addr['street'] if addr is not None else 'NULL',
                        street_number=addr['street_number'] if addr is not None else 'NULL'
                    )
                    try:
                        tweet_item.save()  # store the document in MongoDB
                    except Exception:
                        continue
                if not tweet_id:  # nothing came back for this place, stop paging
                    break
                MAX_ID = min(tweet_id)
                #print(MAX_ID)
            else:
                # subsequent pages: only fetch tweets older than the smallest ID seen so far
                for tweet in api.search(q="place:%s" % place_id, count=100, max_id=MAX_ID - 1)["statuses"]:
                    #print(tweet)
                    #j.write(json.dumps(tweet)+'\n')
                    tweet_id.append(tweet["id"])
                    Obj_id = ObjectId()
                    coords = tweet["coordinates"]["coordinates"] if tweet["coordinates"] is not None else None
                    addr = GetAddress(coords[1], coords[0]) if coords is not None else None
                    tweet_item = twitter_post(
                        _id=Obj_id,
                        text_id=tweet["id"],
                        created_at=datetime.datetime.strptime(tweet["created_at"], GMT_FORMAT),
                        screen_name=tweet["user"]["screen_name"],
                        favorite_count=tweet["favorite_count"],
                        retweet_count=tweet["retweet_count"],
                        text=tweet["text"],
                        source=tweet["source"],
                        country_code=(tweet["place"]["country_code"] if tweet["place"] is not None else 'NULL'),
                        location=tweet["user"]["location"],
                        latitude=str(coords[1]) if coords is not None else 'NULL',
                        longitude=str(coords[0]) if coords is not None else 'NULL',
                        time_zone=tweet["user"]["time_zone"],
                        lang=tweet["lang"],
                        province=addr['province'] if addr is not None else 'NULL',
                        city=addr['city'] if addr is not None else 'NULL',
                        district=addr['district'] if addr is not None else 'NULL',
                        street=addr['street'] if addr is not None else 'NULL',
                        street_number=addr['street_number'] if addr is not None else 'NULL'
                    )
                    try:
                        tweet_item.save()
                    except Exception:
                        continue
                if not tweet_id:
                    break
                MAX_ID = min(tweet_id)
                #print(MAX_ID)
            i -= 1
(The tricky part: handling Chinese and English text at the same time.)
Approach: use the lang field to process the texts of each language separately, build a word cloud per language, then sort and merge the two results.
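The word-cloud code below relies on several imports and module-level globals that this excerpt does not show; a minimal sketch of what they might look like (using NLTK's English stop-word list for _STOP_WORDS is an assumption):

import collections
import traceback
import jieba
import pymongo
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

_STOP_WORDS = set(stopwords.words('english'))  # assumed English stop-word list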
class MongoConn():
    def __init__(self, db_name):
        try:
            url = '127.0.0.1:27017'
            self.client = pymongo.MongoClient(url, connect=True)
            self.db = self.client[db_name]
        except Exception as e:
            print('Failed to connect to MongoDB!')
            traceback.print_exc()

    def destroy(self):
        self.client.close()

    def getDb(self):
        return self.db

    def __del__(self):
        self.client.close()
def docs_preprocessor(docs):
    """Lowercase, tokenize and filter the English documents."""
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()              # convert to lowercase
        docs[idx] = tokenizer.tokenize(docs[idx])  # split into words
    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isdigit()] for doc in docs]
    # Remove stop words.
    docs = [[token for token in doc if token not in _STOP_WORDS] for doc in docs]
    return docs
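nltk_tokenize below also depends on an is_number helper and a project-specific stop module, neither of which appears in this excerpt; a plausible stand-in for both (the stop-word file name is an assumption):

def is_number(s):
    # treat a token as a number if it parses as a float
    try:
        float(s)
        return True
    except ValueError:
        return False

class stop:
    # hypothetical stand-in for the project's `stop` module
    @staticmethod
    def load_stopwords(path='chinese_stopwords.txt'):  # assumed file name
        with open(path, encoding='utf-8') as f:
            return set(line.strip() for line in f)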
def nltk_tokenize(text):
    """Segment a Chinese text with jieba and keep the useful words as features."""
    features = []
    stop_words = stop.load_stopwords()
    try:
        # segment the text; jieba.cut returns a generator, so materialize it once
        # tokens = text.split()  # for English text
        tokens = list(jieba.cut(text))
        for word in tokens:
            # keep the word if it is not a stop word, is 2-4 characters long, and is not a number
            if word not in stop_words and 1 < len(word) < 5 and not is_number(word):
                features.append(word)
    except Exception:
        pass
    return features
class cloudProducer():
    def __init__(self):
        self.mon = MongoConn('ANXIETY')
        self.db = self.mon.getDb()

    def getMainData(self, region_type, region):
        # fetch the tweets for the given region, split by language
        en_docs = []
        ch_docs = []
        twitter_in_english = self.db.twitter_posts.find({region_type: region, "lang": "en"})
        twitter_in_chinese = self.db.twitter_posts.find({region_type: region, "lang": "zh"})
        for x in twitter_in_english:
            #print(x)
            en_docs.append(x["text"])
        print(len(en_docs))
        for x in twitter_in_chinese:
            ch_docs.append(x["text"])
        return [en_docs, ch_docs]
    def produce_en_Cloud(self, region_type, region, num):
        # word cloud for the English tweets
        docs = self.getMainData(region_type, region)[0]
        print(len(docs))
        words_dump = []
        docs = docs_preprocessor(docs)
        for text in docs:
            words_dump = words_dump + text
        cloud = collections.Counter(words_dump).most_common(num)
        print(cloud)
        #json.dump(cloud, open("wordCloud.json", "w", encoding="utf-8"))
        return cloud

    def produce_ch_Cloud(self, region_type, region, num):
        # word cloud for the Chinese tweets
        docs = self.getMainData(region_type, region)[1]
        print(len(docs))
        words_dump = []
        for text in docs:
            features = nltk_tokenize(text)
            #print(features)
            words_dump = words_dump + features
        cloud = collections.Counter(words_dump).most_common(num)  # returns a list of (word, count) tuples
        print(cloud)
        #json.dump(cloud, open("wordCloud.json", "w", encoding="utf-8"))
        return cloud

    def produce_cloud(self, region_type, region, num):
        en_cloud = self.produce_en_Cloud(region_type, region, num)
        ch_cloud = self.produce_ch_Cloud(region_type, region, num)
        cloud = en_cloud + ch_cloud
        cloud = sorted(cloud, key=lambda t: t[1], reverse=True)  # sort the merged Chinese and English clouds by count
        return cloud[0:num]
cp = cloudProducer()
cloud = cp.produce_cloud("province", "北京市", 15)
print(cloud)