网址转Markdown
原创2025年3月10日大约 1 分钟
网址转Markdown
依赖包
requirements.txt
beautifulsoup4
markdownify
requests
main程序
main.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2025-03-10 08:36
# @Author : Jack
# @File : main
"""
main
"""
import os
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from urllib.parse import urljoin, urlparse
# Fetch the page, download every image it references into a local folder,
# rewrite the <img> links to point at the local copies, and save the page
# as Markdown in output.md.

# Page to convert.
url = 'https://nstudy.org/p2025/99/d97d89.html'
# Seconds to wait on each HTTP request. requests applies no timeout by
# default, so without this an unresponsive server would hang the script.
request_timeout = 30
response = requests.get(url, timeout=request_timeout)

# Folder that downloaded images are saved into.
download_folder = 'images'
os.makedirs(download_folder, exist_ok=True)

if response.status_code == 200:
    # When the server declares no charset, requests decodes text/* bodies as
    # ISO-8859-1, which garbles non-Latin pages. In that case hand the raw
    # bytes to BeautifulSoup so it can detect the encoding from the document.
    content_type = response.headers.get('Content-Type', '')
    if 'charset' in content_type.lower():
        html_content = response.text
    else:
        html_content = response.content

    # Parse the HTML and collect every image tag.
    soup = BeautifulSoup(html_content, 'html.parser')
    images = soup.find_all('img')

    saved_names = {}    # absolute image URL -> local file name already saved
    used_names = set()  # local file names taken so far (avoids overwrites)

    for img in images:
        img_url = img.get('src')
        if not img_url:
            continue
        print(url, img_url)
        # Resolve relative links against the page URL.
        img_url = urljoin(url, img_url)
        print(url, img_url)

        # Inline data: URIs and other non-HTTP sources cannot be downloaded;
        # leave them untouched in the document.
        if urlparse(img_url).scheme not in ('http', 'https'):
            continue

        img_name = saved_names.get(img_url)
        if img_name is None:
            # Derive the file name from the URL path; fall back to a generic
            # name when the path ends with "/" and has no basename.
            img_name = os.path.basename(urlparse(img_url).path) or 'image'
            # Different images may share a basename; add a numeric suffix so
            # a later download never overwrites an earlier one.
            stem, ext = os.path.splitext(img_name)
            suffix = 1
            while img_name in used_names:
                img_name = f'{stem}_{suffix}{ext}'
                suffix += 1
            img_path = os.path.join(download_folder, img_name)

            try:
                img_response = requests.get(img_url, timeout=request_timeout)
                # Do not save an HTTP error page as if it were an image.
                img_response.raise_for_status()
                with open(img_path, 'wb') as img_file:
                    img_file.write(img_response.content)
            except (requests.RequestException, OSError) as e:
                print(f"下载图片失败: {img_url},错误: {e}")
                # Keep the image reachable from the standalone Markdown file
                # by pointing at the absolute remote URL.
                img['src'] = img_url
                continue

            print(f"图片已下载: {img_path}")
            saved_names[img_url] = img_name
            used_names.add(img_name)

        # Point the tag at the local copy. Markdown links use forward
        # slashes on every platform, so do not build this with os.path.join.
        img['src'] = f'{download_folder}/{img_name}'

    # Convert the (link-rewritten) HTML to Markdown.
    markdown_content = md(str(soup))

    # Save the result.
    with open('output.md', 'w', encoding='utf-8') as f:
        f.write(markdown_content)
    print("转换完成,Markdown 文件已保存为 output.md")
else:
    print(f"网页请求失败,状态码:{response.status_code}")
运行步骤
- 安装依赖包
pip install -r requirements.txt
- 运行main.py
python main.py
- 根据自己的需求微调代码(例如修改 main.py 中的 `url` 和 `download_folder`)
其他情况
本程序在 Python 3.11.7 环境下运行通过。