
import time
from urllib.parse import urljoin

import requests
from docx import Document
from lxml import etree
# Configuration
# Site root, used as the prefix for chapter hrefs.
DOMAIN = "https://www.runoob.com"
# Tutorial landing page; the chapter list is read from this page.
BASE_URL = DOMAIN + "/pandas/pandas-tutorial.html"
# Name of the Word document the collected text is saved to.
OUTPUT_FILE = "Pandas_完整内容_带格式.docx"
# Headers sent with every HTTP request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    ),
}
# ==========================================
# 1. Collect every chapter link
# ==========================================
print("正在获取所有章节...")
# A timeout keeps the script from blocking indefinitely on a stalled
# connection; raise_for_status() stops here instead of parsing an HTTP
# error page as if it were the chapter index.
resp = requests.get(BASE_URL, headers=HEADERS, timeout=30)
resp.raise_for_status()
resp.encoding = "utf-8"
tree = etree.HTML(resp.text)

# Build the table of contents as (title, absolute URL) pairs from the
# anchors found inside the div whose id is "leftcolumn".
chapters = []
for a in tree.xpath("//div[@id='leftcolumn']//a"):
    hrefs = a.xpath("@href")
    if not hrefs:
        # An anchor without an href has nothing to fetch.
        continue

    direct_text = a.xpath("text()")
    if direct_text:
        title = direct_text[0].strip()
    else:
        # The label is wrapped in a child element; fall back to all
        # descendant text rather than failing on an empty list.
        title = "".join(a.xpath(".//text()")).strip()

    # urljoin resolves root-relative ("/pandas/x.html"), page-relative
    # ("x.html") and already-absolute hrefs; plain concatenation with
    # DOMAIN is only correct for the first form.
    url = urljoin(BASE_URL, hrefs[0])
    chapters.append((title, url))

print(f"✅ 找到 {len(chapters)} 个章节")
print("="*70)
# ==========================================
# 2. Fetch each chapter and echo its text to the terminal
# ==========================================
# Output template for each tag that is emitted; elements with any other
# tag are skipped (their text is still included when an emitted ancestor
# collects its descendant text).
line_templates = {
    "h2": "\n{}\n",
    "p": "{}\n",
    "pre": "\n{}\n",
    "li": "• {}\n",
}

doc = Document()
for idx, (title, url) in enumerate(chapters, 1):
    print(f"\n📚 第 {idx} 章: {title}")
    print("-"*70)

    # Request the page. A failure on one chapter is reported and skipped so
    # that the chapters already collected are still saved at the end.
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(f"⚠️ 请求失败,已跳过: {exc}")
        time.sleep(0.5)
        continue
    resp.encoding = "utf-8"
    page_tree = etree.HTML(resp.text)

    # ==========================================
    # [Key step] Take the whole article body
    # ==========================================
    bodies = page_tree.xpath('//div[@class="article-body"]')
    if not bodies:
        # Indexing [0] on an empty result would raise IndexError and abort
        # the whole run; skip pages that have no such div instead.
        print("⚠️ 未找到正文,已跳过")
        time.sleep(0.5)
        continue
    content = bodies[0]

    # Walk every descendant element in document order and emit the ones
    # that have a template. NOTE: each matching element is emitted with all
    # of its descendant text, so a matching element nested inside another
    # (e.g. a <p> inside an <li>) appears once for each of them.
    pieces = []
    for node in content.xpath(".//*"):
        template = line_templates.get(node.tag)
        if template is None:
            continue
        text = "".join(node.xpath(".//text()")).strip()
        if not text:
            continue
        line = template.format(text)
        # Echo to the terminal
        print(line.strip())
        pieces.append(line + "\n")
    result = "".join(pieces)

    # Write to the Word document: heading, body text, separator rule.
    doc.add_heading(f"{idx}. {title}", level=1)
    doc.add_paragraph(result.strip())
    doc.add_paragraph("-" * 60)

    # Brief pause between consecutive requests.
    time.sleep(0.5)

# Save
doc.save(OUTPUT_FILE)
print("\n🎉 全部抓取完成!文件已保存:", OUTPUT_FILE)
# 原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
# 如有侵权,请联系 cloudcommunity@tencent.com 删除。