Python Learning Log 4

import os
import random
import re
import time
import urllib.request

import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
cookies = {"cookie": "_T_WM"}  # the cookie value is truncated here; paste your own weibo.cn cookie string
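# Optional: read the cookie string from the environment instead of hard-coding it.
# This is only a sketch; the WEIBO_COOKIE variable name is an arbitrary choice,
# not something the rest of this script defines or depends on.
_env_cookie = os.environ.get("WEIBO_COOKIE", "")
if _env_cookie:
    cookies = {"cookie": _env_cookie}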
class Weibo(object):
    def __init__(self,url):
        self.url = url
        self.dir = "C:\\Users\\Desktop\\Python\\Weibo"  # root folder for all downloads
    # Input any URL: returns 1 if the path has no user segment after the domain, else 0
    def getType(self):
        url_split = self.url.split('/')
        user = url_split[3]
        if user == '':
            flag = 1
        else:
            flag = 0
        return flag
    # Input the first page URL; returns the total number of pages
    def getPage(self):
        Html = requests.get(self.url,cookies=cookies,headers=headers).text
        # the page-jump form carries the page count as the last value="..." attribute
        pagereg = r'value="(\d+)"'
        pages = re.compile(pagereg).findall(Html)
        if len(pages) == 0:
            page = 1
        else:
            page = pages[-1]
        return page
    # Input any URL; returns the raw HTML text and its BeautifulSoup tree
    def getUrl(self):
        Html = requests.get(self.url,cookies=cookies,headers=headers).text
        Soup = BeautifulSoup(Html,'lxml')
        return Html,Soup
    # Input the first page URL; returns the nickname, fan count, info-page URL and per-user folder
    def getBasicInfo(self):
        OneHtml = requests.get(self.url,cookies=cookies,headers=headers).text
        OneSoup = BeautifulSoup(OneHtml,'lxml')
        # The user ID comes from the 粉丝 (fans) link, e.g. <a href="/1234567890/fans">粉丝[99]</a>;
        # adjust these patterns if weibo.cn renders the links differently.
        ID_reg = r'<a href="/(\d+)/fans">粉丝'
        fans_reg = r'粉丝\[(\d+)\]'
        name_reg = r'(.+?)的微博'
        ID = re.compile(ID_reg).findall(OneHtml)[0]
        fans = re.compile(fans_reg).findall(OneHtml)[0]
        name = re.compile(name_reg).findall(OneHtml)[0]
        people_dir = self.dir + '\\' + str(name)
        if not os.path.isdir(people_dir):
            os.makedirs(people_dir)  # also creates missing parent folders
        info_url = "https://weibo.cn/" + str(ID) + "/" + "info"
        return name,fans,info_url,people_dir
    # Input the info page url
    def getDetailInfo(self):
        InfoHtml = requests.get(self.url,cookies=cookies,headers=headers).text
        # 学校 (school). The HTML around each field follows the weibo.cn /info page
        # layout (e.g. 性别:男<br/>); tweak the patterns if the markup differs.
        xx = r'学习经历</div><div class="c">(.+?)</div>'
        xuexiao = re.compile(xx).findall(InfoHtml)
        if len(xuexiao) != 0:
            info_school = "学校" + ':' + xuexiao[0] + '\n'
        else:
            info_school = "学校:Missing" + '\n'
        # 性别 (gender)
        xb = r'性别:(.+?)<br/>'
        xingbie = re.compile(xb).findall(InfoHtml)
        if len(xingbie) != 0:
            info_xb = "性别" + ':' + str(xingbie[0]) + '\n'
        else:
            info_xb = "性别:Missing" + '\n'
        # 地区 (region)
        dq = r'地区:(.+?)<br/>'
        diqu = re.compile(dq).findall(InfoHtml)
        if len(diqu) != 0:
            info_dq = "地区" + ':' + str(diqu[0]) + '\n'
        else:
            info_dq = "地区:Missing" + '\n'
        # 生日 (birthday)
        sr = r'生日:(.+?)<br/>'
        shengri = re.compile(sr).findall(InfoHtml)
        if len(shengri) != 0:
            info_sr = "生日" + ':' + str(shengri[0]) + '\n'
        else:
            info_sr = "生日:Missing" + '\n'
        # 简介 (bio)
        jjie = r'简介:(.+?)<br/>'
        jianjie = re.compile(jjie).findall(InfoHtml)
        if len(jianjie) != 0:
            info_jjie = "简介" + ':' + str(jianjie[0]) + '\n'
        else:
            info_jjie = "简介:Missing" + '\n'
        return info_school,info_xb,info_dq,info_sr,info_jjie


# Download the single (non-album) image in one weibo entry, swapping the
# wap180 thumbnail path for the large original
def one(html,dir):
    s = r'src="(.+?)wap180/.+?"/>'
    e = r'src=".+?/wap180/(.+?)"/>'
    ss = re.compile(s).findall(html)[0]
    ee = re.compile(e).findall(html)[0]
    url = ss + "large/" + ee
    print(url)
    curdir = dir + '\\'
    urllib.request.urlretrieve(url, '{}{}.jpg'.format(curdir, ee))


# Follow the 组图 (album) link inside a weibo entry and download every image
# in the album at original size
def group(html,dir):
    reg = r'<(a href=".+?">.+?)'
    regre = re.compile(reg)
    lists = regre.findall(html)
    for i in lists:
        if u'组图' in i:
            ureg = r'a href="(https.+?)">'
            uregre = re.compile(ureg)
            gro_url = uregre.findall(i)[0]
            print(gro_url)
            Group = Weibo(gro_url)
            html,soup = Group.getUrl()
            img = r'img src="(http.+?)".+?原图'
            imgre = re.compile(img)
            imgurl = imgre.findall(html)
            #print("imgurl",imgurl)
            for u in imgurl:
                u = str(u)
                s = r'^(.+?)thumb180/.+?'
                e = r'.+?/thumb180/(.+?)$'
                ss = re.compile(s).findall(u)[0]
                ee = re.compile(e).findall(u)[0]
                uu = ss + "large" + '/' + ee
                print(uu)
                curdir = dir + '\\'
                urllib.request.urlretrieve(uu, '{}{}'.format(curdir, ee))
                time.sleep(1)
            time.sleep(1)


# Write the basic profile info to <name>.txt and return the page count plus
# the per-user download folder
def getInfo(url):
    basic = Weibo(url)
    page = basic.getPage()
    name,fans,info_url,people_dir = basic.getBasicInfo()
    detail = Weibo(info_url)
    xx,xb,dq,sr,jjie = detail.getDetailInfo()
    file = people_dir + '\\' + name + ".txt"
    fo = open(file,'w',encoding='utf-8')
    fo.write("昵称:" + name + '\n')
    fo.write(xb)
    fo.write(sr)
    fo.write("粉丝:" + fans + '\n')
    fo.write(xx)
    fo.write(dq)
    fo.write(jjie)
    fo.write("目录:" + people_dir + '\n')
    fo.close()
    print(name + ":Info write done!")
    return page,people_dir


# Print the timestamp of the newest non-pinned weibo on the given page
def getLastWeiboTime(url):
    time_html,time_soup = Weibo(url).getUrl()
    wb_list = time_soup.find_all('div',class_="c")
    leng = len(wb_list)
    time_list = []
    for i in range(leng):
        weibo = str(wb_list[i])
        #print(wb_list[i])
        if u'置顶' not in weibo and u'赞' in weibo:
            # timestamps sit inside a <span class="ct"> element
            reg = r'<span class="ct">(.+?)<'
            real_time = re.compile(reg).findall(weibo)[0]
            time_list.append(real_time)
    if time_list:
        print(time_list[0])


# Walk every page of a profile and download all original-size images
def getWeibo(ori):
    url = ori + "?page="
    pages,dir = getInfo(url)
    for p in range(1,int(pages) + 1):
        cur_url = url + str(p)
        print("第" + str(p) + "页")
        try:
            Page = Weibo(cur_url)
            page_html,page_soup = Page.getUrl()
            wbs = page_soup.find_all('div',class_="c")
            for w in wbs:
                con = str(w)
                #print(con)
                # keep original posts that carry a 原图 (original image) link, skip reposts
                if u'原图' in con and u'转发了' not in con and u'转发理由' not in con:
                    if u'组图' in con:
                        print("组图")
                        group(con, dir)
                        time.sleep(1)
                    else:
                        print("单图")
                        one(con, dir)
                        time.sleep(1)
        except Exception:
            time.sleep(1)
            continue
    print("Img downloads Done!")


oris = [""]  # fill in the weibo.cn profile home-page URLs to crawl (left empty in this post)
for ori in oris:
    getWeibo(ori)
    getLastWeiboTime(ori)
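
# Usage sketch: the profile URL below is a made-up placeholder showing the expected
# shape of an entry in `oris` (the real target was deliberately left blank above), e.g.
#
#   oris = ["https://weibo.cn/u/1234567890"]
#
# With a valid cookie, getWeibo() creates C:\Users\Desktop\Python\Weibo\<昵称>\, writes
# <昵称>.txt with the profile info, and downloads every original-size image into that
# folder; getLastWeiboTime() prints the timestamp of the newest non-pinned post.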
