import os
import re
import mimetypes
import xml.etree.ElementTree as ET

import requests
import html2text

# Load the XML backup file exported from CNBlogs
tree = ET.parse('CNBlogs_BlogBackup.xml')
root = tree.getroot()
namespace = {'atom': 'http://www.w3.org/2005/Atom'}
# Replace characters that are invalid in file and folder names
def sanitize_filename(name):
    invalid_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|']
    for char in invalid_chars:
        name = name.replace(char, '_')
    return name
# Decide the file extension: prefer the URL suffix, fall back to the
# Content-Type header, and default to .jpg as a last resort
def determine_extension(image_url, content_type):
    url_extension = os.path.splitext(image_url.split('?')[0].split('#')[0])[-1].lower()
    if url_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp']:
        return url_extension
    if content_type:
        mime_extension = mimetypes.guess_extension(content_type.split(';')[0].strip())
        if mime_extension:
            return mime_extension
    return '.jpg'
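# A quick illustrative check (the URL below is hypothetical): a link with
# no usable suffix falls through to the Content-Type header, e.g.
#   determine_extension('https://cdn.example.com/pic?id=1', 'image/png')
# returns '.png' via mimetypes.guess_extension.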
# Download an image and save it into the post folder
def download_image(image_url, folder_path):
    try:
        # Normalize protocol-relative and schemeless URLs
        if image_url.startswith('//'):
            image_url = 'https:' + image_url
        elif not image_url.startswith(('http://', 'https://')):
            image_url = 'https://' + image_url
        # Time out so one dead link cannot stall the whole conversion
        response = requests.get(image_url, timeout=10)
        if response.status_code == 200:
            # Work out the extension from the Content-Type header
            content_type = response.headers.get('Content-Type', '')
            extension = determine_extension(image_url, content_type)
            image_name = sanitize_filename(image_url.split('/')[-1].split('?')[0].split('#')[0])
            if not image_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp')):
                image_name += extension
            image_path = os.path.join(folder_path, image_name)
            with open(image_path, 'wb') as f:
                f.write(response.content)
            return image_name
        else:
            print(f"Failed to download image from {image_url}, status code: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading image from {image_url}: {e}")
    return None
# Find every image link in the Markdown, download it, and rewrite
# the link to point at the local copy
def replace_images(md_content, folder_path):
    image_pattern = re.compile(r'!\[.*?\]\((.*?)\)')
    matches = image_pattern.findall(md_content)
    for image_url in matches:
        print(f"downloading image: {image_url}")
        local_image_name = download_image(image_url, folder_path)
        if local_image_name:
            md_content = md_content.replace(image_url, local_image_name)
    return md_content
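# For example, a line such as ![diagram](https://img.example.com/a.png)
# (hypothetical URL) becomes ![diagram](a.png) once a.png has been saved
# into the post folder next to index.md.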
# Walk every blog post entry in the XML
for entry in root.findall('atom:entry', namespace):
    title = entry.find('atom:title', namespace).text
    # Strip the author suffix; titles in the CNBlogs backup carry the
    # author name ('-你的名字' is a placeholder -- substitute your own)
    if '-你的名字' in title:
        title = title.replace('-你的名字', '').strip()
    content = entry.find('atom:content', namespace).text or ''
    published_date = entry.find('atom:published', namespace).text
    # Replace spaces when naming the folder
    folder_name = sanitize_filename(title.replace(' ', '_'))
    # Each blog post gets its own folder
    folder_path = os.path.join(os.getcwd(), folder_name)
    os.makedirs(folder_path, exist_ok=True)
    if content.strip().startswith('<'):
        # Convert HTML content to Markdown with html2text
        md_content = html2text.html2text(content)
    else:
        md_content = content
    # Rewrite remote image-host links to local paths
    md_content = replace_images(md_content, folder_path)
    # Prepend a front-matter header with the title and publish date
    md_file_content = f"---\nTitle: {title}\nDate: {published_date}\n---\n\n{md_content}\n"
    # Save the post body as index.md inside its folder
    md_filename = os.path.join(folder_path, 'index.md')
    with open(md_filename, 'w', encoding='utf-8') as md_file:
        md_file.write(md_file_content)
print("Conversion complete!")
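After a run, each post should sit in its own folder next to the script, holding index.md plus any downloaded images. A minimal sketch for spot-checking the result, assuming it is executed from the same working directory as the conversion script:

import os

# Print each generated post folder together with its image count
for entry in sorted(os.listdir('.')):
    index_path = os.path.join(entry, 'index.md')
    if os.path.isdir(entry) and os.path.isfile(index_path):
        images = [f for f in os.listdir(entry) if f != 'index.md']
        print(f"{entry}: index.md + {len(images)} image(s)")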