首先判断页面是静态页面还是动态页面:可以先用 requests 直接请求页面,查看返回的源代码
# Fetch the page with a plain HTTP request and eyeball the returned HTML.
response = requests.get('url')
print(response.text)
解析返回的页面源代码,判断目标数据是否直接包含在其中(静态页面);如果是动态页面,则改用 Selenium 驱动浏览器获取渲染后的源代码:
# Drive a real browser so the page's JavaScript runs, then parse the result.
browser = webdriver.Chrome()
browser.get('url')
# page_source holds the DOM AFTER dynamic content has been injected.
soup = bs4.BeautifulSoup(browser.page_source, 'html.parser')
print(soup)
或者
寻找网页JSON,找到JSON之后解析JSON代码
# Hit the JSON endpoint directly and decode the response body into a dict.
response = requests.get('url')
payload = response.json()
print(payload)
# Pull targets out of the raw HTML with a regular expression.
response = requests.get('url')
regex = re.compile(r'正则表达式')
results = regex.findall(response.text)
# Parse the HTML and pick elements out with a CSS selector.
resp = requests.get('url')
# BUG FIX: select() must be called on a BeautifulSoup INSTANCE built from the
# page text — the original called it on the bs4.BeautifulSoup class itself,
# which raises at runtime.
soup = bs4.BeautifulSoup(resp.text, 'html.parser')
for i in soup.select('selector --> 标签层级方式获取'):
    # Text content of the tag (including its children)
    print(i.text)
    # Value of a named attribute on the tag
    print(i.attrs['标签属性名'])
---------------------------------------------华丽的分割线---------------------------------------------
例:垃圾分类信息
# Example: query a garbage-classification API and print every match.
word = input('请输入一个垃圾:')
# `key` is the API credential, defined elsewhere in the full script.
api_url = f'http://api.tianapi.com/txapi/lajifenlei/index?key={key}&word={word}'
resp = requests.get(api_url)
for news in resp.json()['newslist']:
    print(news['name'], news['type'], news['aipre'], news['explain'], news['contain'], news['tip'])
详情请借鉴《关于BOSS直聘动态Cookie的自动化获取》
https://blog.csdn.net/weixin_42788769/article/details/112341879
---------------------------------------------华丽的分割线---------------------------------------------
# Pretend to be a real browser via the User-Agent request header so the
# site serves the normal page instead of blocking the scraper.
spoof_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4375.0 Safari/537.36 Edg/89.0.756.0'
}
resp = requests.get(url='https://movie.douban.com/top250', headers=spoof_headers)
# Some pages (e.g. Zhihu) require a login before they can be scraped.
# Whether a session is logged in travels in its cookies, so once we capture
# the Cookie header from a logged-in browser session we can replay it here
# to reach login-only pages.
login_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4375.0 Safari/537.36 Edg/89.0.756.0',
    'Cookie': 'll="118318"; bid=aCAd756EKMw; _vwo_uuid_v2=DFB59E4F87AA4AA5871B1705DE6622267|1d87dac2bf747016b497357b2e3d3852; dbcl2="229867820:DeXvsQ5h5ZE"; ck=T5bR; push_noty_num=0; push_doumail_num=0',
}
resp = requests.get(url=detail_url, headers=login_headers)
# Pause like a human reader between page visits (basic anti-bot throttling).
# BUG FIX: the original called random.random(3, 5) — random.random() takes
# NO arguments and that call raises TypeError. random.uniform(3, 5) draws a
# float in [3, 5], which is what was intended.
# Adjust the bounds to taste.
time.sleep(random.uniform(3, 5))
---------------------------------------------华丽的分割线---------------------------------------------
# Fan work out to a thread pool, then wait for every task to finish.
# NOTE(review): pool.submit is normally issued inside a loop over many
# hrefs — this fragment shows a single submission; confirm against the
# full script.
with ThreadPoolExecutor(max_workers=16) as pool:
    pending = [pool.submit(download_picture, href)]
    for task in pending:
        # .result() blocks until the task completes and re-raises its errors
        task.result()
---------------------------------------------华丽的分割线---------------------------------------------
Python 对 CSV / Excel 文件的操作十分便捷
# Write a CSV (comma-separated values) file.
import csv

# FIX: the csv docs require newline='' on files opened for csv.writer —
# without it an extra blank line appears between rows on Windows.
with open('test.csv', 'w', encoding='gbk', newline='') as file:
    # QUOTE_ALL wraps every field (including numbers) in quotes.
    csv_writer = csv.writer(file, quoting=csv.QUOTE_ALL)
    csv_writer.writerow(['关羽', 90, 80, 70])
    csv_writer.writerow(['张飞', 75, 92, 50])
    csv_writer.writerow(['赵云', 99, 98, 97])
    csv_writer.writerow(['马超', 95, 82, 55])
    csv_writer.writerow(['黄忠', 90, 80, 90])
# Read a CSV file back row by row.
import csv

# FIX: newline='' lets the csv module handle line endings itself (per the
# csv docs), and the encoding must match the one used when writing (gbk
# above — the original omitted it and silently fell back to the platform's
# locale default, which breaks on non-Chinese systems).
with open('xxxx.csv', 'r', encoding='gbk', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        print(row)
# Create a new workbook and a sheet for the scraped movie data.
wb = openpyxl.Workbook()
sheet = wb.create_sheet('Top250')

# Header row (row 1): one column name per cell; openpyxl rows/cols are 1-based.
col_names = ('排名', '名称', '评分', '类型', '制片国家', '语言', '时长')
for index, name in enumerate(col_names):
    sheet.cell(1, index + 1, name)

rank = 0
# NOTE(review): in the full scraper the fields below (title, score, gener, ...)
# come from the parsing loop; this fragment shows only the write-out step.
# BUG FIX: bump rank BEFORE writing so the data lands on row 2 and below —
# the original wrote to row rank + 1 while rank was still 0, i.e. row 1,
# clobbering the header row just created above.
rank += 1
movie_details = [rank, title, score, gener, country, language, duration]
# Write the movie's fields across the columns of its row.
for index, item in enumerate(movie_details):
    sheet.cell(rank + 1, index + 1, item)
# NOTE(review): remember wb.save('Top250.xlsx') at the end — not shown here.