Python获取网页里全部网址

from selenium import webdriver
import re
# Collect every http(s) URL that appears in the rendered source of a page
# and save the resulting list to a text file.
url = "http://www.baidu.com/"

# The pattern contains only non-capturing groups (?:...), so findall() returns
# each whole matched URL rather than the contents of a group.
# NOTE(review): inside the character class, `$-_` is a range (from '$' to '_'),
# not three literal characters. That range is what lets '/', ':', '?' and '='
# match, but it also admits characters such as '<', '>' and "'". It is left
# unchanged here on purpose; tighten it only together with explicit path chars.
restr = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
rex = re.compile(restr, re.IGNORECASE)

brower = webdriver.Firefox()
try:
    brower.get(url)
    pagesoures = brower.page_source  # source of the loaded page
finally:
    # quit() ends the whole WebDriver session, and the finally clause makes
    # sure that happens even when loading the page raises.
    brower.quit()

mylist = rex.findall(pagesoures)
print(mylist)

# Raw string: "\d" is not a recognised escape sequence in a normal literal.
# The with-statement closes the file even if print() fails.
with open(r"D:\data.txt", 'w+') as data:
    print(mylist, file=data)

python读入写出
Python 常用的读取文件函数有三种:read()、readline()、readlines()
read() 一次性读全部内容

# read(): load the entire file into a single string.
with open("test.txt", "r") as f:
    data = f.read()
# The string stays available after the with-block has closed the file.
print(data)

readline() 每次调用读取一行内容(首次调用读取第一行)

# readline(): return one line per call; on a freshly opened file that is the
# first line, including its trailing newline.
with open("test.txt", "r") as f:
    data = f.readline()
print(data)

readlines() 读取所有行,返回一个列表

# readlines(): return a list with one string per line of the file.
with open("test.txt", "r") as f:
    data = f.readlines()
print(data)

readlines() 返回的每个元素末尾都带有换行符,可以用 strip('\n') 去掉:

# Print every line without the newline that readlines() keeps on each element
# (print() already ends its output with one).
with open("test.txt", "r") as f:
    stripped = (raw.strip('\n') for raw in f.readlines())
    for line in stripped:
        print(line)

write

# The with-statement closes the file on exit, so no explicit f.close() is needed.
with open("test.txt", "w") as f:
    f.write("这是个测试!")

用 print() 把内容写入文件中

# print() can write to a file instead of the console through its `file`
# argument. The path is a raw string because "\d" is not a recognised escape
# sequence, and the with-statement closes the file when the block ends.
with open(r"D:\data.txt", 'w+') as data:
    print('这是个测试', file=data)

也可以先用 pip install requests 安装 requests 库,再用它来爬取数据

你可能感兴趣的:(Python获取网页里全部网址)