It is fairly common knowledge that plugins are available to export WordPress into static files. However, if you want far more flexible control over how the static content is generated, it's programming to the rescue.
The script is adapted from the original code found in Jon Thysell's excellent WXR to HTML article.
I modified the original script slightly to download the images referenced in each article, and to save each article as an individual file in YYYY/MM/DD/<title>.html format.
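For illustration (the post title and image file name here are made up), a post published on 1 May 2016 containing one image hosted on the blog itself would end up roughly like this, relative to the directory the script is run from:

2016/05/01/My First Post.html
upload/2016/05/photo.jpg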
The script takes one argument: the full path to the WordPress WXR export file (in XML format). The WXR file can be exported by going to WordPress Admin -> Tools -> Export. I used a full export.
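For reference, the script only reads a handful of elements from that file. A heavily abridged item looks something like the following (all values here are placeholders; the wp: and content: namespaces are http://wordpress.org/export/1.2/ and http://purl.org/rss/1.0/modules/content/ respectively):

<item>
  <title>Sample post</title>
  <link>https://example.wordpress.com/?p=123</link>
  <pubDate>Sun, 01 May 2016 10:00:00 +0000</pubDate>
  <wp:post_id>123</wp:post_id>
  <wp:post_date>2016-05-01 10:00:00</wp:post_date>
  <content:encoded><![CDATA[<p>Post body goes here.</p>]]></content:encoded>
</item>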
Save the code with a .py extension, then launch a command prompt and navigate to the path where the .py file is located. The command will look something like this:
python D:\Workspace\to-html.py D:\Workspace\wordpress-export-2016.xml
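Note that the script targets Python 3 (it uses urllib.parse) and relies on the third-party lxml and urllib3 packages, so you may need to install those first:

pip install lxml urllib3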
#!/usr/bin/env python
from __future__ import unicode_literals
from urllib.parse import quote
"""
WXR to HTML <https://jonthysell.com/>
Copyright 2012 Jon Thysell <thysell@gmail.com>
This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
"""
import sys
import os
import codecs
import string
import re
from lxml import etree
from datetime import datetime
import urllib3
_article_header = """
<html>
<head>
<title>%s</title>
<style>
body {
color: #444;
margin: 0 auto;
font-size: 12px;
line-height: 1.5em;
padding: 0;
background: #eee;
font-family: Verdana, sans-serif;
}
h1 {
margin-top: 5px;
margin-bottom: 5px;
font-family: Arial, sans-serif;
font-weight: normal;
font-size: 24px;
line-height: 28px;
color: #222;
}
.date {
margin-bottom: 5px;
font-weight: bold;
color: #666;
}
.entry {
font-size: 14px;
line-height: 1.4em;
}
</style>
</head>
<body style="width:850px;">
"""
_article_footer = """
</body>
</html>
"""
_article_content = u"""
<h1>%s</h1>
<div class="date">%s</div>
<div class="entry">%s</div>
<p><em>Exported from <a href="%s">%s</a></em></p>
"""
_title = ""
_link = ""
_desc = ""
_pubdate = ""
_items = []
# Strip non-ASCII characters for use in file creation
def strip_non_ascii(string):
    ''' Returns the string without non-ASCII characters '''
    # Keep 1-32 (whitespace) / 45-46 (dash, period) / 48-57 (0-9) / 65-90 (A-Z) / 97-122 (a-z)
    # A simpler alternative would be: stripped = (c for c in string if 0 < ord(c) < 127)
    stripped = (c for c in string if (0 < ord(c) < 33 or ord(c) == 45 or ord(c) == 46 or 47 < ord(c) < 58 or 64 < ord(c) < 91 or 96 < ord(c) < 123))
    return ''.join(stripped)
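# Illustrative example (hypothetical input): strip_non_ascii("Café & Bar!") returns "Caf  Bar" --
# accented letters and punctuation outside the allowed set are simply dropped from the file name.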
# Reads a chunk of post content, finds all <img> src URLs, downloads the images hosted on the blog
# itself ("logicallook"), saves them under a relative upload/yyyy/mm/ path, and rewrites the src
# attributes to point at the local copies.
def download_images(string):
    matches = re.findall(r'\ssrc="([^"]+)"', string)
    http = urllib3.PoolManager()
    for image in matches:
        # only handle images hosted on the original blog
        if "logicallook" in image:
            chunks = image.split('/')
            path = u"upload/" + chunks[-3] + "/" + chunks[-2] + "/" + chunks[-1]
            # rewrite the src to the local copy, relative to the yyyy/mm/dd/ article folder
            string = string.replace(image, "../../../" + path)
            if not os.path.exists(path):
                os.makedirs(os.path.dirname(path), exist_ok=True)
                # some images may have unicode names
                image = image.replace(chunks[-1], quote(chunks[-1]))
                file = http.request('GET', image, preload_content=False)
                with open(path, 'wb') as output:
                    while True:
                        data = file.read(500)
                        if not data:
                            break
                        output.write(data)
                file.release_conn()
    return string
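# Illustrative example (hypothetical URL): an image at .../logicallook.../2016/05/photo.jpg is saved
# to upload/2016/05/photo.jpg, and the post's src becomes ../../../upload/2016/05/photo.jpg, which
# resolves correctly from the yyyy/mm/dd/ folder the article is written to.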
# This controls whether to add in paragraph tags. Most likely you want this on. Only change this to False if for some reason your posts are already valid HTML.
_autop = True

def autop(s):
    s = s.replace("\r\n", "\n")
    s = s.replace("\n\n", "</p><p>")
    s = s.replace("\n", "<br />")
    return s
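# Illustrative example: autop("one\n\ntwo\nthree") returns "one</p><p>two<br />three" --
# blank lines become paragraph breaks and single newlines become line breaks.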
def create_html(path, header, content, footer):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print("Writing to %s" % path)
    with codecs.open(path, encoding='utf-8', mode='w') as html_file:
        html_file.write(header)
        html_file.write(content)
        html_file.write(footer)
def main(input_file):
    """Take a WXR XML file and export each post as an HTML file."""
    global _title, _link, _desc, _pubdate, _items, _autop
    print("Reading from %s" % input_file)
    with codecs.open(input_file, 'r', encoding='utf-8') as wxr_file:
        tree = etree.parse(wxr_file)
    _title = tree.xpath('/rss/channel/title')[0].text
    _link = tree.xpath('/rss/channel/link')[0].text
    _desc = tree.xpath('/rss/channel/description')[0].text
    _pubdate = tree.xpath('/rss/channel/pubDate')[0].text
    xml_items = tree.xpath('/rss/channel/item')
    for xml_item in xml_items:
        t = xml_item.xpath('title')[0].text
        l = xml_item.xpath('link')[0].text
        p = xml_item.xpath('pubDate')[0].text
        i = xml_item.xpath('wp:post_id', namespaces={'wp': 'http://wordpress.org/export/1.2/'})[0].text
        d = xml_item.xpath('wp:post_date', namespaces={'wp': 'http://wordpress.org/export/1.2/'})[0].text[:10]
        # attachments and empty posts may have no content at all
        c = xml_item.xpath('content:encoded', namespaces={'content': 'http://purl.org/rss/1.0/modules/content/'})[0].text or ""
        c = download_images(c)
        if _autop:
            c = autop(c)
        _items.append((l, t, p, c))
        # only export real posts (their default permalinks contain ?p=), skipping pages and attachments
        if i and "?p=" in l:
            path = d.replace("-", "/") + "/" + strip_non_ascii(t) + ".html"
            # escape anything non-ASCII as HTML character references, then decode back to str for the template
            contents = (t, d, c.encode('ascii', 'xmlcharrefreplace').decode('ascii'), l, l)
            create_html(path, _article_header % t, _article_content % contents, _article_footer)

if __name__ == "__main__":
    main(sys.argv[1])