Wednesday, September 7, 2016

Generate HTML static files from Wordpress WXR using Python

Tested on Python 3.5 (32 bit) for Windows

It is fairly common knowledge that plugins are available to export WordPress into static files. However, to have your static content generated in a far more flexible manner, it's always programming to the rescue.

The script is adapted from the original code found in Jon Thysell's excellent WXR to HTML article.

I modified the original script slightly to download the images that are included in each article, and to save each article as an individual file in YYYY\MM\DD\title.html format.

This script takes one argument as input: the full path to the WordPress WXR export file (in XML format). The WXR file can be exported by going to WordPress Admin -> Tools -> Export. I used a full export.



Save the code with a .py extension, then open a command line and navigate to the folder where the .py file is located. The command will look similar to the one below:

python D:\Workspace\to-html.py D:\Workspace\wordpress-export-2016.xml

 #!/usr/bin/env python  
 from __future__ import unicode_literals  
 from urllib.parse  import quote  
 """  
 WXR to HMTL <https://jonthysell.com/>  
 Copyright 2012 Jon Thysell <thysell@gmail.com>  
 This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.  
 Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:  
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.  
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.  
 3. This notice may not be removed or altered from any source distribution.  
 """  
 import sys  
 import os  
 import codecs  
 import string  
 import re  
 from lxml import etree  
 from datetime import datetime  
 import urllib3  
 
 # Opening markup written at the top of every generated page: the <head>
 # (with inline CSS) and the opening <body> tag. The single %s placeholder
 # is filled with the post title via "_article_header % title".
 _article_header = """  
 <html>  
 <head>  
 <title>%s</title>  
 <style>  
 body {  
   color: #444;  
   margin: 0 auto;  
   font-size: 12px;  
   line-height: 1.5em;  
   padding: 0;  
   background: #eee;  
   font-family: Verdana, sans-serif;  
 }  
 h1 {  
   margin-top: 5px;  
   margin-bottom: 5px;  
   font-family: Arial, sans-serif;  
   font-weight: normal;  
   font-size: 24px;  
   line-height: 28px;  
   color: #222;  
 }  
 .date {  
   margin-bottom: 5px;  
   font-weight: bold;  
   color: #666;  
 }  
 .entry {  
   font-size: 14px;  
   line-height: 1.4em;  
 }  
 </style>  
 </head>  
 <body style="width:850px;">  
 """  
 # Closing markup written at the end of every generated page; it closes the
 # <body> and <html> elements opened by _article_header. No placeholders.
 _article_footer = """  
 </body>  
 </html>  
 """  
 # Per-article body template. The five %s placeholders are filled, in order,
 # with: title, post date, entry HTML, original post URL (href) and original
 # post URL again (link text).
 # Fix: the class attribute on the date <div> was missing its closing quote.
 _article_content = u"""  
 <h1>%s</h1>  
 <div class="date">%s</div>  
 <div class="entry">%s</div>  
 <p><em>Exported from <a href="%s">%s</a></em></p>  
 """  
 # Channel-level metadata read from the WXR file by main(); all start empty.
 _title = _link = _desc = _pubdate = ""
 # One (link, title, pubDate, content) tuple per exported item, appended by main().
 _items = list()
 # Reduce a string to a conservative character set for use in file creation
 def strip_non_ascii(string):
   """Return *string* with every character outside a small whitelist removed.

   Characters are kept when their code point is in one of these ranges:
   1-32 (control characters and space), 45-46 (dash, period),
   48-57 (0-9), 65-90 (A-Z) or 97-122 (a-z). Everything else, including
   all non-ASCII characters and most punctuation, is dropped.
   """
   def _is_kept(ch):
     code = ord(ch)
     if code in (45, 46):
       return True
     return 1 <= code <= 32 or 48 <= code <= 57 or 65 <= code <= 90 or 97 <= code <= 122
   return ''.join(filter(_is_kept, string))
 # Reads a chunk of HTML, finds every src="..." URL, downloads the ones that
 # belong to the blog itself into upload/<a>/<b>/<file> (the last three URL
 # segments) and rewrites those URLs to point at the local copy.
 def download_images(string, site_filter="logicallook"):
   """Download the blog's own images referenced in *string* and relink them.

   Parameters:
     string: HTML text to scan. Empty or None input is returned unchanged.
     site_filter: substring a URL must contain to be treated as one of the
       blog's own files. Defaults to the original hard-coded "logicallook".

   Returns:
     The HTML with each matching URL replaced by "../../../upload/...".

   Files are saved relative to the current working directory and are not
   downloaded again when the target path already exists. A response with a
   status other than 200 is skipped, so an error page is never stored as
   an image.
   """
   if not string:
     return string
   http = None  # created on first download so image-free posts need no pool
   for image in re.findall(r'\ssrc="([^"]+)"', string):
     if site_filter not in image:
       continue
     chunks = image.split('/')
     # presumably <yyyy>/<mm>/<file>, WordPress's usual upload layout -- the
     # code only relies on the URL having at least three '/'-separated parts
     path = u"upload/" + chunks[-3] + "/" + chunks[-2] + "/" + chunks[-1]
     string = string.replace(image, "../../../" + path)
     if os.path.exists(path):
       continue
     os.makedirs(os.path.dirname(path), exist_ok=True)
     # some images may have unicode names
     url = image.replace(chunks[-1], quote(chunks[-1]))
     if http is None:
       http = urllib3.PoolManager()
     response = http.request('GET', url, preload_content=False)
     try:
       if response.status != 200:
         print("Skipping %s (HTTP %s)" % (url, response.status))
         continue
       with open(path, 'wb') as output:
         while True:
           data = response.read(65536)
           if not data:
             break
           output.write(data)
     finally:
       # always hand the connection back, even if reading or writing failed
       response.release_conn()
   return string
 # This controls whether to add in paragraph tags. Most likely you want this on. Only change this to False if for some reason your posts are already valid HTML.
 _autop = True
 def autop(s):
   """Convert plain-text line breaks in *s* into HTML paragraph/line-break tags.

   Blank lines become paragraph boundaries and single newlines become
   "<br />". The result is wrapped in one outer <p>...</p> so the inserted
   "</p><p>" separators are balanced. Empty or None input is returned
   unchanged.
   """
   if not s:
     return s
   s = s.replace("\r\n", "\n")
   s = s.replace("\n\n", "</p><p>")
   # was "<br \>": an invalid escape sequence and not a valid self-closing tag
   s = s.replace("\n", "<br />")
   return "<p>" + s + "</p>"
 def create_html(path, header, content, footer):
   """Write header, content and footer, in that order, to *path* as UTF-8.

   Any missing parent directories of *path* are created first. A bare file
   name with no directory part is written to the current working directory;
   previously that case raised because os.makedirs('') is an error.
   """
   directory = os.path.dirname(path)
   if directory:
     os.makedirs(directory, exist_ok=True)
   print ("Writing to %s" % path)
   with codecs.open(path, encoding='utf-8', mode='w') as html_file:
     html_file.write(header)
     html_file.write(content)
     html_file.write(footer)
 def main(input_file):
   """Take a WXR XML file and export one HTML file per post.

   Parameters:
     input_file: path to the WordPress WXR export, read as UTF-8.

   Side effects:
     - Stores the channel title, link, description and pubDate in the
       module globals _title, _link, _desc and _pubdate.
     - Appends a (link, title, pubDate, content) tuple to _items for every
       <item> in the channel.
     - For every item that has a post id and whose link contains "?p=",
       writes <yyyy>/<mm>/<dd>/<sanitised title>.html via create_html().

   Raises IndexError if an expected element is missing; this includes a WXR
   file whose wp: namespace is not the 1.2 URI used below.
   """
   global _title, _link, _desc, _pubdate, _items, _autop
   wp_ns = {'wp': 'http://wordpress.org/export/1.2/'}
   content_ns = {'content': 'http://purl.org/rss/1.0/modules/content/'}
   print ("Reading from %s" % input_file)
   with codecs.open(input_file, 'r', encoding='utf-8') as wxr_file:
     tree = etree.parse(wxr_file)
     _title = tree.xpath('/rss/channel/title')[0].text
     _link = tree.xpath('/rss/channel/link')[0].text
     _desc = tree.xpath('/rss/channel/description')[0].text
     _pubdate = tree.xpath('/rss/channel/pubDate')[0].text
     for xml_item in tree.xpath('/rss/channel/item'):
       # .text is None for empty elements (untitled or empty posts); use ""
       # so the string operations below do not crash
       t = xml_item.xpath('title')[0].text or ""
       l = xml_item.xpath('link')[0].text or ""
       p = xml_item.xpath('pubDate')[0].text
       i = xml_item.xpath('wp:post_id', namespaces=wp_ns)[0].text
       # first 10 characters of wp:post_date, i.e. the YYYY-MM-DD part
       d = xml_item.xpath('wp:post_date', namespaces=wp_ns)[0].text[:10]
       c = xml_item.xpath('content:encoded', namespaces=content_ns)[0].text or ""
       c = download_images(c)
       if _autop:
         c = autop(c)
       _items.append((l, t, p, c))
       if i and "?p=" in l:
         # fall back to the post id so untitled posts do not all map to ".html"
         file_name = strip_non_ascii(t) or ("post-" + i)
         path = d.replace("-", "/") + "/" + file_name + ".html"
         # encode() returns bytes on Python 3; decode back to text so "%s"
         # inserts the markup itself rather than its b'...' repr
         body = c.encode('ascii', 'xmlcharrefreplace').decode('ascii')
         contents = (t, d, body, l, l)
         create_html(path, _article_header % t, _article_content % contents, _article_footer)

 # Script entry point: expects the path to the WXR export as the first argument.
 if __name__ == "__main__":
   if len(sys.argv) < 2:
     # exit with a usage hint instead of an IndexError traceback
     sys.exit("Usage: python %s <path-to-wordpress-export.xml>" % sys.argv[0])
   main(sys.argv[1])

No comments:

Related Posts Plugin for WordPress, Blogger...