
import time
from urllib.parse import urljoin

import requests
from lxml import etree
from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.util import Pt
# Entry page of the tutorial; fetched first to collect the chapter links.
BASE_URL = "https://www.runoob.com/pandas/pandas-tutorial.html"
# Scheme and host of the site being scraped.
DOMAIN = "https://www.runoob.com"
# File name the generated presentation is saved under.
PPT_NAME = "Pandas教程.pptx"
# Request headers sent with every page fetch.
HEADERS = {"User-Agent": "Mozilla/5.0"}
def get_html_tree(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The value produced by ``etree.HTML`` for the response text.

    Raises:
        requests.RequestException: On connection problems, on timeout
            (10 seconds), or when the server answers with a 4xx/5xx status.
    """
    res = requests.get(url, headers=HEADERS, timeout=10)
    # Fail loudly on HTTP errors instead of parsing an error page as content.
    res.raise_for_status()
    # Force UTF-8 so the text is not decoded with a guessed charset.
    res.encoding = "utf-8"
    return etree.HTML(res.text)
# Initialise the presentation. Index 6 is the "Blank" layout in python-pptx's
# default template, so every chapter slide starts without placeholders.
prs = Presentation()
blank_layout = prs.slide_layouts[6]

# Collect (title, absolute URL) pairs from the navigation column of the
# tutorial's entry page.
root_tree = get_html_tree(BASE_URL)
a_list = root_tree.xpath('//div[@id="leftcolumn"]//a')
chapters = []
for a in a_list:
    # Join all descendant text so anchors whose label sits inside a child
    # element still yield a title instead of raising IndexError.
    title = "".join(a.xpath(".//text()")).strip()
    href = (a.get("href") or "").strip()
    if not title or not href:
        # Nothing usable to name or fetch; skip this anchor.
        continue
    # urljoin resolves root-relative, page-relative and absolute hrefs alike,
    # whereas plain string concatenation only works for root-relative ones.
    chapters.append((title, urljoin(BASE_URL, href)))
print(f"共获取 {len(chapters)} 个章节\n")
# Build one slide per chapter, then save the presentation.
#
# Text box geometry is derived from the slide dimensions so the boxes span
# the slide between margins rather than being a few points wide.
margin = Pt(36)
title_height = Pt(40)
box_width = prs.slide_width - 2 * margin
content_top = margin + title_height
content_height = prs.slide_height - content_top - margin

for idx, (chap_title, page_url) in enumerate(chapters, 1):
    print(f"===== 第{idx}章:{chap_title} =====")
    try:
        page_tree = get_html_tree(page_url)
        body = page_tree.xpath('//div[@class="article-body"]')[0]
    except Exception as exc:
        # Best-effort scrape: a chapter that cannot be fetched or has no
        # article body is skipped. Catching Exception (not a bare except)
        # keeps Ctrl-C and SystemExit working; the cause is printed so the
        # skip is not silent.
        print(f"⚠️ 未找到内容,跳过 ({type(exc).__name__}: {exc})")
        continue

    # One new blank slide per chapter.
    slide = prs.slides.add_slide(blank_layout)

    # Chapter title across the top of the slide.
    title_box = slide.shapes.add_textbox(margin, margin, box_width, title_height)
    tf_title = title_box.text_frame
    tf_title.word_wrap = True
    p_title = tf_title.paragraphs[0]
    p_title.text = chap_title
    p_title.font.size = Pt(20)
    p_title.font.bold = True

    # Body text box filling the rest of the slide.
    # NOTE(review): all of a chapter's text goes into this single box, so
    # long chapters will overflow the slide; pagination is not handled here.
    content_box = slide.shapes.add_textbox(
        margin, content_top, box_width, content_height
    )
    tf_content = content_box.text_frame
    tf_content.word_wrap = True

    # Echo each non-empty child of the article body to the terminal and add
    # it to the slide as its own paragraph.
    first_paragraph = True
    for node in body.iterchildren():
        text = "".join(node.xpath(".//text()")).strip()
        if not text:
            continue
        print(text)

        # A new text frame already contains one empty paragraph; reuse it for
        # the first item so the box does not start with a blank line.
        if first_paragraph:
            p = tf_content.paragraphs[0]
            first_paragraph = False
        else:
            p = tf_content.add_paragraph()
        p.text = text

        # Section headings (h2) are rendered larger and bold.
        if node.tag == "h2":
            p.font.bold = True
            p.font.size = Pt(16)
        else:
            p.font.size = Pt(12)

    print("-" * 60)
    # Short pause between chapter requests to avoid hammering the site.
    time.sleep(0.5)

# Write the finished presentation to disk.
prs.save(PPT_NAME)
print("\n✅ 全部完成!PPT 已保存为:", PPT_NAME)
# 原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
# 如有侵权,请联系 cloudcommunity@tencent.com 删除。
# 原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
# 如有侵权,请联系 cloudcommunity@tencent.com 删除。