首页
学习
活动
专区
圈层
工具
发布
社区首页 >专栏 >抓取菜鸟教程

抓取菜鸟教程

原创
作者头像
牛马打工人
发布2026-05-24 12:57:06
发布2026-05-24 12:57:06
1480
举报

import time
from urllib.parse import urljoin

import requests
from docx import Document
from lxml import etree

# Configuration
BASE_URL = "https://www.runoob.com/pandas/pandas-tutorial.html"
DOMAIN = "https://www.runoob.com"  # kept for reference; links are resolved against BASE_URL
OUTPUT_FILE = "Pandas_完整内容_带格式.docx"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs forever
REQUEST_DELAY = 0.5  # seconds to pause between chapter requests

# Output template for each HTML tag that is copied into the document.
# Elements with any other tag are ignored.
LINE_TEMPLATES = {
    "h2": "\n{}\n",
    "p": "{}\n",
    "pre": "\n{}\n",
    "li": "• {}\n",
}


def resolve_url(href, base=BASE_URL):
    """Return *href* as an absolute URL, resolved against *base*.

    Root-relative, page-relative, protocol-relative and already-absolute
    links are all handled; plain string concatenation with the domain is only
    correct for the root-relative case.
    """
    return urljoin(base, href)


def format_line(tag, text):
    """Return the output line for an element, or ``None`` if it is skipped.

    An element is skipped when its text is empty or when its tag has no entry
    in ``LINE_TEMPLATES``.
    """
    if not text:
        return None
    template = LINE_TEMPLATES.get(tag)
    if template is None:
        return None
    # ``text`` is passed as an argument, so braces inside it are not
    # interpreted as format fields.
    return template.format(text)


def join_lines(lines):
    """Combine formatted lines into the text stored for one chapter.

    Each line is followed by one extra newline, and surrounding whitespace is
    stripped from the combined result.
    """
    return "".join(line + "\n" for line in lines).strip()


def fetch_tree(url):
    """Download *url* and return the parsed lxml HTML tree.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response status.
    """
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    resp.encoding = "utf-8"
    return etree.HTML(resp.text)


def collect_chapters(tree):
    """Return ``(title, url)`` pairs for the links in the left-hand menu.

    Anchors that have no direct text node or no ``href`` attribute are
    skipped rather than raising ``IndexError``.
    """
    if tree is None:
        return []
    chapters = []
    for anchor in tree.xpath("//div[@id='leftcolumn']//a"):
        texts = anchor.xpath("text()")
        hrefs = anchor.xpath("@href")
        if not texts or not hrefs:
            continue
        chapters.append((texts[0].strip(), resolve_url(hrefs[0])))
    return chapters


def is_nested_in_handled(node, root):
    """Return True if *node* lies inside another copied element below *root*.

    The text of an outer element already includes the text of everything
    inside it, so emitting the inner element as well would duplicate it.
    """
    for ancestor in node.iterancestors():
        if ancestor is root:
            break
        if ancestor.tag in LINE_TEMPLATES:
            return True
    return False


def extract_lines(content):
    """Return the formatted lines for every copied element under *content*."""
    lines = []
    for node in content.xpath(".//*"):
        if node.tag not in LINE_TEMPLATES:
            continue
        if is_nested_in_handled(node, content):
            continue
        text = "".join(node.xpath(".//text()")).strip()
        line = format_line(node.tag, text)
        if line is not None:
            lines.append(line)
    return lines


def scrape_chapter(url):
    """Fetch one chapter page and return its formatted lines.

    Returns ``None`` (after printing a warning) when the request fails or the
    page has no ``article-body`` container, so that one bad page does not
    abort the run and discard the chapters already collected.
    """
    try:
        page_tree = fetch_tree(url)
    except requests.RequestException as exc:
        print(f"⚠️ 请求失败,已跳过: {url} ({exc})")
        return None

    bodies = []
    if page_tree is not None:
        bodies = page_tree.xpath('//div[@class="article-body"]')
    if not bodies:
        print(f"⚠️ 未找到正文,已跳过: {url}")
        return None
    return extract_lines(bodies[0])


def main():
    """Scrape every chapter of the tutorial and save it as a Word document."""
    # 1. Collect all chapter links from the tutorial's landing page.
    print("正在获取所有章节...")
    chapters = collect_chapters(fetch_tree(BASE_URL))
    print(f"✅ 找到 {len(chapters)} 个章节")
    print("=" * 70)

    # 2. Fetch each chapter, echo it to the terminal, and add it to the document.
    doc = Document()
    for idx, (title, url) in enumerate(chapters, 1):
        print(f"\n📚 第 {idx} 章: {title}")
        print("-" * 70)

        lines = scrape_chapter(url)
        if lines is not None:
            for line in lines:
                print(line.strip())

            doc.add_heading(f"{idx}. {title}", level=1)
            doc.add_paragraph(join_lines(lines))
            doc.add_paragraph("-" * 60)

        # Pause between requests, including after a skipped chapter.
        time.sleep(REQUEST_DELAY)

    # 3. Save the document.
    doc.save(OUTPUT_FILE)
    print("\n🎉 全部抓取完成!文件已保存:", OUTPUT_FILE)


if __name__ == "__main__":
    main()

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档