python中beautifulsoup是什么_python中BeautifulSoup的基本使用

基本使用

实例1:

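html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""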

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.title)

print(type(soup.title))

print(soup.head)

print(soup.p)

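#输出:

<title>The Dormouse's story</title>

<class 'bs4.element.Tag'>

<head><title>The Dormouse's story</title></head>

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>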

获取名称

html = """

The Dormouse's story

The Dormouse's story

Once upon a time there were three little sisters; and their names were

,

Lacie and

Tillie;

and they lived at the bottom of a well.

...

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.title.name)

#输出:

title #获取的是标签的名称

获取属性

html = """

The Dormouse's story

The Dormouse's story

Once upon a time there were three little sisters; and their names were

,

Lacie and

Tillie;

and they lived at the bottom of a well.

...

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.p.attrs['name'])

print(soup.p['name'])

#输出:

dromouse

dromouse

获取内容

html = """

The Dormouse's story

The Dormouse's story

Once upon a time there were three little sisters; and their names were

,

Lacie and

Tillie;

and they lived at the bottom of a well.

...

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.p.string)

#输出:

The Dormouse's story

嵌套选择

html = """

The Dormouse's story

The Dormouse's story

Once upon a time there were three little sisters; and their names were

,

Lacie and

Tillie;

and they lived at the bottom of a well.

...

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.head.title.string)

#输出:

The Dormouse's story

子节点和子孙节点

实例1:

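html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""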

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.p.contents)#子节点

#输出:

['\n Once upon a time there were three little sisters; and their names were\n ',

Elsie

, '\n', Lacie, ' \n and\n ', Tillie, '\n and they lived at the bottom of a well.\n ']

实例2:

html = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were

Elsie

Lacie

and

Tillie

and they lived at the bottom of a well.

...

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.p.children)#children同样表示直接子节点,但返回的是迭代器而不是列表,需要遍历才能得到各个节点

for i, child in enumerate(soup.p.children):

    print(i, child)

#输出:

0

Once upon a time there were three little sisters; and their names were

1

Elsie

2

3 Lacie

4

and

5 Tillie

6

and they lived at the bottom of a well.

实例3:

html = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were

Elsie

Lacie

and

Tillie

and they lived at the bottom of a well.

...

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.p.descendants)

for i, child in enumerate(soup.p.descendants):#子孙节点,会输出子节点和孙节点

    print(i, child)

#输出:

0

Once upon a time there were three little sisters; and their names were

1

Elsie

2

3 Elsie

4 Elsie

5

6

7 Lacie

8 Lacie

9

and

父节点和祖先节点

实例1:

html = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were

Elsie

Lacie

and

Tillie

and they lived at the bottom of a well.

...

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.a.parent)#a的父节点输出

#输出

Once upon a time there were three little sisters; and their names were

Elsie

Lacie

and

Tillie

and they lived at the bottom of a well.

实例2:

html = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were

Elsie

Lacie

and

Tillie

and they lived at the bottom of a well.

...

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(list(enumerate(soup.a.parents)))#输出a的所有祖先节点:从直接父节点p开始,依次向上到body、html,最后是BeautifulSoup对象本身

兄弟节点

html = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were

Elsie

Lacie

and

Tillie

and they lived at the bottom of a well.

...

"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(list(enumerate(soup.a.next_siblings)))#输出第一个a标签之后的所有兄弟节点(与a同级的标签和文本节点,不包括a内部的内容)

print(list(enumerate(soup.a.previous_siblings)))#输出第一个a前面的,与a同级的内容

#输出:

[(0, '\n'), (1, Lacie), (2, ' \n and\n '), (3, Tillie), (4, '\n and they lived at the bottom of a well.\n ')]

[(0, '\n Once upon a time there were three little sisters; and their names were\n ')]

标准选择器

find_all( name , attrs , recursive , text , **kwargs ) #可根据标签名、属性、文本内容查找文档,返回所有匹配结果组成的列表(Beautiful Soup 4.4.0起text参数更名为string,推荐使用string)

实例1:

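html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1" name="elements">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''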

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.find_all('ul'))

print(type(soup.find_all('ul')[0]))

#输出:

[

  • Foo
  • Bar
  • Jay
,
  • Foo
  • Bar
]

attrs

实例1:

html='''

Hello

  • Foo
  • Bar
  • Jay
  • Foo
  • Bar

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.find_all(attrs={'id': 'list-1'}))

print(soup.find_all(attrs={'name': 'elements'}))

#输出:

[

  • Foo
  • Bar
  • Jay
]

[

  • Foo
  • Bar
  • Jay
]

实例2:

html='''

Hello

  • Foo
  • Bar
  • Jay
  • Foo
  • Bar

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.find_all(id='list-1'))

print(soup.find_all(class_='element'))

#输出:

[

  • Foo
  • Bar
  • Jay
]

[

Foo, Bar, Jay, Foo, Bar]

text

html='''

Hello

  • Foo
  • Bar
  • Jay
  • Foo
  • Bar

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.find_all(text='Foo'))

#输出:['Foo', 'Foo']

find( name , attrs , recursive , text , **kwargs )

find返回单个元素,find_all返回所有元素

实例:

html='''

Hello

  • Foo
  • Bar
  • Jay
  • Foo
  • Bar

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.find('ul'))

print(type(soup.find('ul')))

print(soup.find('page'))

#输出:

  • Foo
  • Bar
  • Jay

None

CSS选择器

通过select()直接传入CSS选择器即可完成选择

实例1:

html='''

Hello

  • Foo
  • Bar
  • Jay
  • Foo
  • Bar

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.select('.panel .panel-heading'))

print(soup.select('ul li'))

print(soup.select('#list-2 .element'))

print(type(soup.select('ul')[0]))

#输出:

[

Hello

]

[

Foo, Bar, Jay, Foo, Bar]

[

Foo, Bar]

实例2:

html='''

Hello

  • Foo
  • Bar
  • Jay
  • Foo
  • Bar

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

for ul in soup.select('ul'):

    print(ul.select('li'))

#输出:

[

Foo, Bar, Jay]

[

Foo, Bar]

获取属性

实例:

html='''

Hello

  • Foo
  • Bar
  • Jay
  • Foo
  • Bar

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

for ul in soup.select('ul'):

    print(ul['id'])

    print(ul.attrs['id'])

输出:

list-1

list-1

list-2

list-2

获取内容

实例1:

html='''

Hello

  • Foo
  • Bar
  • Jay
  • Foo
  • Bar

'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

for li in soup.select('li'):

    print(li.get_text())

#输出:

Foo

Bar

Jay

Foo

Bar

你可能感兴趣的:(python中beautifulsoup是什么_python中BeautifulSoup的基本使用)