抓取菜鸟教程

原创

牛马打工人

发布于 2026-05-24 12:57:06

1480

import time
from urllib.parse import urljoin

import requests
from docx import Document
from lxml import etree

# Configuration

# Entry page of the tutorial; it is fetched first to discover the chapter links.
BASE_URL = "https://www.runoob.com/pandas/pandas-tutorial.html"

# Scheme and host of the site being scraped.
DOMAIN = "https://www.runoob.com"

# Path of the Word document written at the end of the run.
OUTPUT_FILE = "Pandas_完整内容_带格式.docx"

# HTTP headers sent with every request (a desktop browser User-Agent).
HEADERS = {

"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

}

# ==========================================
# 1. Collect every chapter link
# ==========================================

print("正在获取所有章节...")

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops early instead of parsing an HTTP error page.
resp = requests.get(BASE_URL, headers=HEADERS, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"
tree = etree.HTML(resp.text)

# Build the table of contents as (title, absolute URL) pairs from the
# anchors found inside the element with id "leftcolumn".
chapters = []
for a in tree.xpath("//div[@id='leftcolumn']//a"):
    # Join all descendant text nodes so an anchor whose label sits in a child
    # element (or that has no text at all) does not raise IndexError.
    title = "".join(a.xpath(".//text()")).strip()
    href = a.get("href")
    if not title or not href:
        # Nothing usable to label or fetch; skip this anchor.
        continue
    # urljoin resolves root-relative, page-relative and absolute hrefs alike,
    # whereas plain concatenation with the domain is only right for "/path".
    url = urljoin(BASE_URL, href)
    chapters.append((title, url))

print(f"✅ 找到 {len(chapters)} 个章节")
print("="*70)

# ==========================================
# 2. Fetch each chapter, echo it to the terminal and add it to the document
# ==========================================

doc = Document()

for idx, (title, url) in enumerate(chapters, 1):
    print(f"\n📚 第 {idx} 章: {title}")
    print("-"*70)

    # Request the chapter page. A failed request is reported and skipped so
    # that one bad chapter does not discard everything scraped so far.
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(f"⚠️ 请求失败，已跳过: {exc}")
        continue
    resp.encoding = "utf-8"
    page_tree = etree.HTML(resp.text)

    # Locate the article container; skip pages that do not have one instead
    # of crashing with IndexError.
    bodies = page_tree.xpath('//div[@class="article-body"]')
    if not bodies:
        print("⚠️ 未找到正文，已跳过")
        continue
    content = bodies[0]

    # Walk every descendant element and keep headings, paragraphs, code
    # blocks and list items, each rendered as plain text.
    # NOTE(review): nested matches (e.g. a <p> inside an <li>) are emitted
    # once per matching element, so their text can appear twice.
    parts = []
    for node in content.xpath(".//*"):
        tag = node.tag
        text = "".join(node.xpath(".//text()")).strip()
        if not text:
            continue

        if tag == "h2":
            line = f"\n{text}\n"
        elif tag == "p":
            line = f"{text}\n"
        elif tag == "pre":
            line = f"\n{text}\n"
        elif tag == "li":
            line = f"• {text}\n"
        else:
            # Any other element is ignored.
            continue

        # Echo to the terminal.
        print(line.strip())
        parts.append(line + "\n")

    # Collect pieces in a list and join once rather than growing a string.
    result = "".join(parts)

    # Write the chapter into the Word document.
    doc.add_heading(f"{idx}. {title}", level=1)
    doc.add_paragraph(result.strip())
    doc.add_paragraph("-" * 60)

    # Short pause between requests to avoid hammering the server.
    time.sleep(0.5)

# Save the document.
doc.save(OUTPUT_FILE)

print("\n🎉 全部抓取完成！文件已保存：", OUTPUT_FILE)

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

python

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

python

登录后参与评论

0 条评论

热度

抓取菜鸟教程

抓取菜鸟教程

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐