感谢每一位认真阅读我文章的人。看着粉丝数一路上涨、关注越来越多,礼尚往来总是要有的:
① 2000多本Python电子书(主流和经典的书籍应该都有了)
② Python标准库资料(最全中文版)
③ 项目源码(四五十个有趣且经典的练手项目及源码)
④ Python基础入门、爬虫、web开发、大数据分析方面的视频(适合小白学习)
⑤ Python学习路线图(告别不入流的学习)
网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。
需要这份系统化学习资料的朋友,可以戳这里获取
一个人可以走得很快,但一群人才能走得更远!不论你是正从事IT行业的老鸟,还是对IT行业感兴趣的新人,都欢迎加入我们的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!
data = json.loads(r.text)
max_behot_time = data[‘next’][‘max_behot_time’]
if max_behot_time:
article_list = data[‘data’]
for i in article_list:
try:
if i[‘article_genre’] == ‘article’:
res = requests.get(‘https://www.toutiao.com/i’ + i[‘group_id’], headers=headers(),
cookies=cookies)
article_title = re.findall(“title: ‘(.*?)’”, res.text)
article_content = re.findall(“content: ‘(.*?)’”, res.text, re.S)[0]
article_content = article_content.replace(‘"’, ‘’).replace(‘u003C’, ‘<’).replace(
‘u003E’,
‘>’).replace(
‘=’,
‘=’).replace(
‘u002F’, ‘/’).replace(‘\’, ‘’)
article_images = etree.HTML(article_content)
article_image = article_images.xpath(‘//img/@src’)
article_time = re.findall(“time: ‘(.*?)’”, res.text)
article_source = re.findall(“source: ‘(.*?)’”, res.text, re.S)
result_time = []
[result_time.append(i) for i in
str(article_time[0]).split(’ ‘)[0].replace(’-‘, ‘,’).split(’,')]
cha = (datetime.now() - datetime(int(result_time[0]), int(result_time[1]),
int(result_time[2]))).days
if 30 < cha <= 32:
continue
if cha > 32:
print(‘完成’)
break_flag.append(1)
break
row = {‘发表时间’: article_time[0], ‘标题’: article_title[0].strip(‘"’),
‘来源’: article_source[0],‘所有图片’:article_image,
‘文章内容’: article_content.strip()}
with open(‘/toutiao/’ + str(csv_name) + ‘文章.csv’, ‘a’, newline=‘’, encoding=‘gb18030’)as f:
f_csv = csv.DictWriter(f, headers1)
f_csv.writerow(row)
print(‘正在爬取文章:’, article_title[0].strip(‘"’), article_time[0],
‘https://www.toutiao.com/i’ + i[‘group_id’])
time.sleep(1)
else:
pass
except Exception as e:
print(e, ‘https://www.toutiao.com/i’ + i[‘group_id’])
wenzhang(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n)
else:
pass
except KeyError:
n += 1
print(‘第’ + str(n) + ‘次请求’, first_url)
time.sleep(1)
if n == max_qingqiu:
print(‘请求超过最大次数’)
break_flag.append(1)
else:
pass
except Exception as e:
print(e)
else:
pass
def get_wenzhang_detail(url, csv_name=0):
    """Fetch one article page and append it to ``/toutiao/<csv_name>文章.csv``.

    Title, content, publish time and source are extracted from the page text
    with regular expressions. ASCII letters, digits and the listed punctuation
    characters are stripped from the content before it is stored.

    Args:
        url: Address of the article detail page.
        csv_name: Prefix used to build the output CSV file name.

    Returns:
        ``'ok'`` once the row has been written, or ``None`` when the publish
        date lies more than 8 days before today.

    Raises:
        IndexError: If the page text has no match for one of the fields.
    """
    headers1 = ['发表时间', '标题', '来源', '文章内容']
    res = requests.get(url, headers=headers_a, cookies=cookies)
    page = res.text

    article_title = re.findall("title: '(.*?)'", page)
    article_content = re.findall("content: '(.*?)'", page, re.S)
    # The hyphen must be escaped: a bare "~-_" is a reversed character range
    # and makes re.compile() raise "bad character range".
    pattern = re.compile(r"[(a-zA-Z~\-_!@#$%^+*&\/?|:.<>{}()';=)*|\d]")
    article_content = re.sub(pattern, '', article_content[0])
    article_time = re.findall("time: '(.*?)'", page)
    article_source = re.findall("source: '(.*?)'", page, re.S)

    # Keep only the date part of "YYYY-MM-DD hh:mm:ss" and split it into
    # year / month / day.
    date_parts = str(article_time[0]).split(' ')[0].replace('-', ',').split(',')
    published = datetime(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))
    cha = (datetime.now() - published).days
    if cha > 8:
        return None

    row = {'发表时间': article_time[0], '标题': article_title[0].strip('"'),
           '来源': article_source[0], '文章内容': article_content.strip()}
    # gb18030 matches the encoding used by the other CSV writers and csv_read().
    with open('/toutiao/' + str(csv_name) + '文章.csv', 'a', newline='',
              encoding='gb18030') as f:
        f_csv = csv.DictWriter(f, headers1)
        f_csv.writerow(row)
    print('正在爬取文章:', article_title[0].strip('"'), article_time[0], url)
    time.sleep(0.5)  # throttle between article requests
    return 'ok'
# Non-empty once the video crawl must stop: either the age cut-off was reached
# or the retry budget was used up. Callers clear it before starting a crawl.
break_flag_video = []


def _shipin_page_url(user_id, max_behot_time):
    """Build the list-API URL for the page that starts at *max_behot_time*."""
    # Call get_as_cp() once so that 'as' and 'cp' come from the same result.
    as_cp = get_as_cp()
    return 'https://www.toutiao.com/c/user/article/?page_type=0&user_id=%s&max_behot_time=%s&count=20&as=%s&cp=%s&_signature=%s' % (
        user_id, max_behot_time, as_cp['as'], as_cp['cp'],
        get_signature(user_id, max_behot_time))


def shipin(url, max_behot_time=0, csv_name=0, n=0):
    """Crawl a user's videos and append them to ``/toutiao/<csv_name>视频.csv``.

    Pages of the list API are requested one after another. For every item the
    ixigua detail page is fetched to find the ``video_id``, the play-URL API on
    ib.365yg.com is queried, and the base64-encoded ``main_url`` is decoded and
    stored. The crawl stops when an item older than 32 days is met, or when
    ``n`` reaches the retry limit of 20.

    Args:
        url: Profile URL; the second-to-last ``/``-separated part is used as
            the user id.
        max_behot_time: Paging cursor to start from (0 requests the first page).
        csv_name: Prefix used to build the output CSV file name.
        n: Number of failed page requests already counted.

    Returns:
        None. Results are written to the CSV file and progress is printed.
    """
    max_qingqiu = 20
    headers2 = ['视频发表时间', '标题', '来源', '视频链接']
    user_id = url.split('/')[-2]
    first_url = _shipin_page_url(user_id, max_behot_time)

    # Pages are walked iteratively instead of recursing once per page, so a
    # long feed cannot exhaust the recursion limit.
    while n < max_qingqiu and not break_flag_video:
        video_list = None
        next_behot_time = None
        try:
            res = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(res.text)
            next_behot_time = data['next']['max_behot_time']
            if next_behot_time:
                video_list = data['data']
        except KeyError:
            # Response lacks the expected keys; handled as a failed attempt below.
            pass
        except Exception as e:
            print(e)

        if video_list is None:
            # Every unusable response (missing keys, falsy cursor, request or
            # JSON error) consumes retry budget, so the same URL can never be
            # requested in an endless loop.
            n += 1
            print('第' + str(n) + '次请求', first_url)
            time.sleep(3)
            if n == max_qingqiu:
                print('请求超过最大次数')
                break_flag_video.append(1)
            continue

        for i in video_list:
            detail_url = None
            try:
                detail_url = 'https://www.ixigua.com/i' + i['item_id']
                start_time = i['behot_time']
                video_title = i['title']
                video_source = i['source']

                # Check the age before any network access: items that will be
                # skipped need no requests, and a failing detail request can no
                # longer hide the stop condition.
                age_days = (int(time.time()) - start_time) / 86400
                if 30 < age_days <= 32:
                    # NOTE(review): the 30-32 day band is skipped rather than
                    # ending the crawl -- presumably a tolerance for items
                    # that are out of order; confirm.
                    continue
                if age_days > 32:
                    print('完成')
                    break_flag_video.append(1)
                    break

                resp = requests.get(detail_url, headers=headers())
                r = str(random.random())[2:]
                url_part = "/video/urls/v/1/toutiao/mp4/{}?r={}".format(
                    re.findall('"video_id":"(.*?)"', resp.text)[0], r)
                # The play-URL API takes the CRC32 of the path as parameter 's'.
                s = crc32(url_part.encode())
                api_url = "https://ib.365yg.com{}&s={}".format(url_part, s)
                resp = requests.get(api_url, headers=headers())
                j_resp = resp.json()
                video_url = j_resp['data']['video_list']['video_1']['main_url']
                video_url = b64decode(video_url.encode()).decode()

                row = {'视频发表时间': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)),
                       '标题': video_title, '来源': video_source,
                       '视频链接': video_url}
                with open('/toutiao/' + str(csv_name) + '视频.csv', 'a', newline='',
                          encoding='gb18030') as f:
                    f_csv = csv.DictWriter(f, headers2)
                    f_csv.writerow(row)
                print('正在爬取视频:', video_title, detail_url, video_url)
                time.sleep(3)
            except Exception as e:
                # Print the URL computed above; indexing the item again here
                # could raise a second time from inside the handler.
                print(e, detail_url)

        if break_flag_video:
            break
        # Advance to the next page.
        max_behot_time = next_behot_time
        first_url = _shipin_page_url(user_id, max_behot_time)
# Non-empty once the micro-post crawl must stop: age cut-off reached, feed
# exhausted, or retry budget used up. Callers clear it before starting a crawl.
break_flag_weitoutiao = []


def weitoutiao(url, max_behot_time=0, n=0, csv_name=0):
    """Crawl a user's micro-posts into ``/toutiao/<csv_name>微头条.csv``.

    Pages of the ``pc_profile_ugc`` feed are requested one after another. For
    every entry the detail page is fetched and its time, author name, title,
    first image and content are extracted with regular expressions. The crawl
    stops at the first post older than 30 days, when the feed returns a falsy
    paging cursor, or when ``n`` reaches the retry limit of 20.

    Args:
        url: Profile URL; the second-to-last ``/``-separated part is used as
            the user id.
        max_behot_time: Paging cursor to start from (0 requests the first page).
        n: Number of failed page requests already counted.
        csv_name: Prefix used to build the output CSV file name.

    Returns:
        None. Results are written to the CSV file and progress is printed.
    """
    max_qingqiu = 20
    headers3 = ['微头条发表时间', '来源', '标题', '文章内图片', '微头条内容']
    user_id = url.split('/')[-2]

    # Pages are walked iteratively instead of recursing once per page.
    while n < max_qingqiu and not break_flag_weitoutiao:
        try:
            first_url = 'https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=%s&max_behot_time=%s' % (
                user_id, max_behot_time)
            res = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(res.text)
            # The cursor is only advanced after the page was read completely,
            # so a malformed response does not skip a page.
            next_behot_time = data['next']['max_behot_time']
            weitoutiao_list = data['data'] or []
        except Exception as e:
            if not isinstance(e, KeyError):
                print(e)
            # Every failed page request consumes retry budget, which keeps the
            # loop bounded for request and JSON errors as well.
            n += 1
            print('第' + str(n) + '次请求')
            time.sleep(2)
            if n == max_qingqiu:
                print('请求超过最大次数')
                break_flag_weitoutiao.append(1)
            continue

        for i in weitoutiao_list:
            detail_url = None
            try:
                detail_url = 'https://www.toutiao.com/a' + str(i['concern_talk_cell']['id'])
                resp = requests.get(detail_url, headers=headers(), cookies=cookies)
                page = resp.text
                start_time = re.findall("time: '(.*?)'", page, re.S)
                weitoutiao_name = re.findall("name: '(.*?)'", page, re.S)
                weitoutiao_title = re.findall("title: '(.*?)'", page, re.S)
                # The brackets must be escaped; unescaped, ["(.*?)"] is a
                # character class and nothing is captured.
                weitoutiao_images = re.findall(r'images: \["(.*?)"\]', page, re.S)
                if weitoutiao_images:
                    # Undo the page's escaping: 'u002F' becomes '/' and the
                    # remaining backslashes are dropped.
                    weitoutiao_image = 'http:' + weitoutiao_images[0].replace('u002F', '/').replace('\\', '')
                else:
                    weitoutiao_image = '此头条内无附件图片'
                weitoutiao_content = re.findall("content: '(.*?)'", page, re.S)

                # Keep only the date part of "YYYY-MM-DD hh:mm:ss".
                date_parts = str(start_time[0]).split(' ')[0].replace('-', ',').split(',')
                published = datetime(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))
                cha = (datetime.now() - published).days
                if cha > 30:
                    break_flag_weitoutiao.append(1)
                    print('完成')
                    break

                row = {'微头条发表时间': start_time[0], '来源': weitoutiao_name[0],
                       '标题': weitoutiao_title[0].strip('"'), '文章内图片': weitoutiao_image,
                       '微头条内容': weitoutiao_content[0].strip('"')}
                with open('/toutiao/' + str(csv_name) + '微头条.csv', 'a', newline='',
                          encoding='gb18030') as f:
                    f_csv = csv.DictWriter(f, headers3)
                    f_csv.writerow(row)
                time.sleep(1)
                print('正在爬取微头条', weitoutiao_name[0], start_time[0], detail_url)
            except Exception as e:
                # Print the URL computed above; indexing the item again here
                # could raise a second time from inside the handler.
                print(e, detail_url)

        if break_flag_weitoutiao:
            break
        if not next_behot_time:
            # A falsy cursor would request the first page again and duplicate
            # every row, so it is treated as the end of the feed.
            break_flag_weitoutiao.append(1)
            break
        max_behot_time = next_behot_time
def csv_read(path):
    """Read a gb18030-encoded CSV file and return all of its rows.

    Args:
        path: Location of the CSV file.

    Returns:
        A list with one list of strings per CSV row, in file order. An empty
        file yields an empty list.
    """
    # newline='' lets the csv module do its own newline handling, so line
    # breaks inside quoted fields are returned unchanged.
    with open(path, 'r', encoding='gb18030', newline='') as f:
        return list(csv.reader(f, dialect='excel'))
def main():
    """Crawl every account listed in ``toutiao-suoyou.csv``.

    For each row, column 0 is used as the output file prefix, column 2 as the
    profile URL and column 3 as a text naming the content types to crawl. For
    every type keyword found in column 3 the matching CSV file gets its header
    row, the type's stop flag is cleared and its crawler is started.
    """
    # (type keyword, header columns, stop flag, crawler), in crawl order.
    jobs = (
        ('文章', ['发表时间', '标题', '来源', '所有图片', '文章内容'],
         break_flag, wenzhang),
        ('视频', ['视频发表时间', '标题', '来源', '视频链接'],
         break_flag_video, shipin),
        ('微头条', ['微头条发表时间', '来源', '标题', '文章内图片', '微头条内容'],
         break_flag_weitoutiao, weitoutiao),
    )
    for j, i in enumerate(csv_read('toutiao-suoyou.csv')):
        if len(i) < 4:
            # Blank or truncated rows carry no URL or type column.
            continue
        for kind, fieldnames, stop_flag, crawl in jobs:
            if kind not in i[3]:
                continue
            print('当前正在抓取' + kind + '第', j, i[2])
            # gb18030 matches the encoding the crawlers use when appending rows.
            with open('/toutiao/' + i[0] + kind + '.csv', 'a', newline='',
                      encoding='gb18030') as f:
                # In append mode the position starts at the end of the file,
                # so 0 means the file is new or empty; the header is written
                # only then, not again on every run.
                if f.tell() == 0:
                    csv.DictWriter(f, fieldnames).writeheader()
            stop_flag.clear()
            crawl(url=i[2], csv_name=i[0])
def get_all(urlQueue):
    """Take account rows from *urlQueue* until it is empty and crawl their videos.

    For each row, column 0 is used as the output file prefix, column 2 as the
    profile URL and column 3 as a text naming the content types. Only rows
    whose column 3 contains '视频' are processed.

    Args:
        urlQueue: Queue object providing ``get_nowait()``.

    Returns:
        None.
    """
    # Imported here so the block does not depend on the file's import list.
    # NOTE(review): assumes urlQueue signals exhaustion with queue.Empty, as
    # queue.Queue does -- confirm for the queue type used.
    from queue import Empty

    headers2 = ['视频发表时间', '标题', '来源', '视频链接']
    while True:
        try:
            data_url = urlQueue.get_nowait()
        except Empty:
            # Queue drained: nothing left for this worker.
            break
        if len(data_url) < 4 or '视频' not in data_url[3]:
            continue
        print('当前正在抓取视频', data_url[2])
        # gb18030 matches the encoding shipin() uses when appending rows.
        with open('/toutiao/' + data_url[0] + '视频.csv', 'a', newline='',
                  encoding='gb18030') as f:
            # Position 0 in append mode means the file is new or empty.
            if f.tell() == 0:
                csv.DictWriter(f, headers2).writeheader()
        # NOTE(review): break_flag_video is one module-level list; if several
        # workers run this function at once they share it -- confirm intent.
        break_flag_video.clear()
        shipin(url=data_url[2], csv_name=data_url[0])
学好 Python 不论是就业还是做副业赚钱都不错,但要学会 Python 还是要有一个学习规划。最后给大家分享一份全套的 Python 学习资料,给那些想学习 Python 的小伙伴们一点帮助!
Python所有方向路线就是把Python常用的技术点做整理,形成各个领域的知识点汇总,它的用处就在于,你可以按照上面的知识点去找对应的学习资源,保证自己学得较为全面。
工欲善其事必先利其器。学习Python常用的开发软件都在这里了,给大家节省了很多时间。
书籍的好处就在于权威和体系健全,刚开始学习的时候你可以只看视频或者听某个人讲课,但等你学完之后,你觉得你掌握了,这时候建议还是得去看一下书籍,看权威技术书籍也是每个程序员必经之路。
我们在看视频学习的时候,不能光动眼动脑不动手,比较科学的学习方法是在理解之后运用它们,这时候练手项目就很适合了。
光学理论是没用的,要学会跟着一起敲,要动手实操,才能将自己的所学运用到实际当中去,这时候可以搞点实战案例来学习。
我们学习Python必然是为了找到高薪的工作,下面这些面试题是来自阿里、腾讯、字节等一线互联网大厂最新的面试资料,并且有阿里大佬给出了权威的解答,刷完这一套面试资料相信大家都能找到满意的工作。
网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。
需要这份系统化学习资料的朋友,可以戳这里获取
一个人可以走得很快,但一群人才能走得更远!不论你是正从事IT行业的老鸟,还是对IT行业感兴趣的新人,都欢迎加入我们的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!