# Sample HTML document used by all the Beautiful Soup snippets below.
# (The markup — including the HTML comment written in Chinese — is data
# that the examples parse, so it is left exactly as-is.)
html_doc = """ <html><head><title>My Home Page</title></head> <body> <!-- 这是个注释 --> <p class="story"> Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well. </p> <p>...<a href="http://example.com/test">Test Link</a>...</p> </body></html> """

# Build the parse tree with the 'lxml' parser.
# NOTE(review): BeautifulSoup must be imported earlier in the file
# (e.g. `from bs4 import BeautifulSoup`) — confirm the import exists.
soup_example = BeautifulSoup(html_doc, 'lxml')
# Iterate the direct children of the <body> tag. Text/whitespace nodes
# (NavigableString) have a name of None, so the guard skips everything
# that is not a real Tag object.
for child in body_tag.children:
    if child.name is None:
        continue  # not a Tag — skip
    print(f"Body 的子标签: {child.name}")  # prints: p, p
5.3 parent 和 parents
parent:访问元素的父节点。
parents:返回一个生成器,可迭代地获取所有祖先节点。
# .parent is the immediate container; .parents is a generator that walks
# every ancestor, innermost first, up to the BeautifulSoup object itself
# (whose name is "[document]").
a_tag = soup_example.a  # first <a> in document order
print(f"\n第一个 A 标签的父节点: {a_tag.parent.name}")  # p
print(f"第一个 A 标签的所有祖先节点:")
for ancestor in a_tag.parents:
    if ancestor.name is not None:
        # prints: p, body, html, [document]
        print(ancestor.name)
# select() takes a CSS selector; the bare tag name 'p' matches every
# <p> element in the document and returns them as a list.
all_p_tags = soup_example.select('p')
message = f"\n通过 CSS 选择器查找所有 P 标签:\n{all_p_tags}"
print(message)
# 'a.sister' matches <a> elements whose class attribute contains "sister".
sister_a_tags = soup_example.select('a.sister')
print(f"\n通过 CSS 选择器查找 class='sister' 的 A 标签:")
for link in sister_a_tags:
    # .string is the tag's single text child: Elsie, Lacie, Tillie
    print(link.string)
# Descendant combinator: 'p a' matches every <a> nested at ANY depth
# inside a <p> element.
p_a_tags = soup_example.select('p a')
output = f"\n查找 p 标签下的所有 a 标签:\n{p_a_tags}"
print(output)
# Child combinator: '>' restricts the match to <a> elements that are
# DIRECT children of a <p class="story"> (contrast with 'p a' above).
selector = 'p.story > a'
story_a_tags = soup_example.select(selector)
print(f"\n在 class='story' 的 p 标签下的直接子 a 标签:\n{story_a_tags}")
# .text concatenates the text of every descendant node, preserving the
# original whitespace/newlines of the markup.
combined_text = p_tag.text
print(f"P 标签的 text (所有子标签文本): {combined_text}")
# Output:
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
# get_text() offers finer control than .text: `separator` is inserted
# between text pieces and strip=True trims whitespace from each piece.
joined = p_tag.get_text(separator='|', strip=True)
print(f"P 标签的 get_text(separator='|', strip=True):\n{joined}")
# Output:
# Once upon a time there were three little sisters;|and their names were|Elsie,|Lacie and|Tillie;|and they lived at the bottom of a well.
八、常见爬虫流程示例
sequenceDiagram
participant User as 用户
participant PythonScript as Python 脚本
participant WebServer as 目标网站服务器
User->>PythonScript: 运行爬虫脚本
PythonScript->>WebServer: 1. 发送 HTTP 请求 (requests.get(url))
WebServer->>PythonScript: 2. 返回 HTML 响应
PythonScript->>PythonScript: 3. 使用 Beautiful Soup 解析 HTML (BeautifulSoup(html_content, 'lxml'))
PythonScript->>PythonScript: 4. 遍历/搜索解析树 (find_all(), select())
PythonScript->>PythonScript: 5. 提取所需数据 (tag.get('attr'), tag.text)
PythonScript->>PythonScript: 6. 数据清洗与存储 (CSV/JSON/DB)
PythonScript->>User: 7. 提供抓取结果
# 假设导航链接在 nav 标签中,并且是 ul > li > a 的结构 # 这需要根据实际网页结构调整 nav_links = soup.select('nav ul li a') links_data = [] for link in nav_links: text = link.text.strip() href = link.get('href') if text and href: # 确保文本和链接都存在 links_data.append({'text': text, 'href': href}) return links_data
except requests.exceptions.RequestException as e: print(f"请求失败: {e}") return []