msspec_python3/doc/source/sitemap-generate.py

73 lines
2.9 KiB
Python
Raw Normal View History

# coding: utf8
import os, fnmatch
from datetime import datetime
from pytz import timezone
import re
from lxml import etree
import argparse
# Defaults for the optional command-line switches.  The patterns are raw
# strings so that the backslash in "\." reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
opt_defaults = {
    'include': r".*\.html|.*\.pdf",
    'exclude': r"google.*\.html",
    'timezone': 'Europe/Paris',
}

# Command-line interface.  --url and --path have no usable default (the rest
# of the script passes them to os.walk / joins file names onto them), so a
# command line that omits them is rejected here with a usage message rather
# than failing later with a TypeError on None.
parser = argparse.ArgumentParser(description="Create an XML sitemap file from a source html folder.")
parser.add_argument("--url", help="The website address.", required=True)
parser.add_argument("--path", help="The root folder to explore.", required=True)
parser.add_argument("--include", help="The include regexp pattern.", default=opt_defaults['include'])
parser.add_argument("--exclude", help="The exclude regexp pattern.", default=opt_defaults['exclude'])
parser.add_argument("--timezone", help="The time zone.", default=opt_defaults['timezone'])
parser.add_argument("--priority", help="The page priority.", type=float, default=.8)
parser.add_argument("--changefreq", help="How often the page is likely to change.", default="monthly",
                    choices=["always", "hourly", "daily", "weekly", "monthly", "yearly", "never"])
parser.add_argument("output", help="The output file name.")

# Parsed options, read by the remainder of the script.
opts = parser.parse_args()
# Template for one sitemap entry; 'loc' and 'lastmod' are overwritten for
# every file, 'priority' and 'changefreq' are the same for all of them.
defaults = {'loc': '', 'lastmod': '', 'priority': "{:.1f}".format(opts.priority), 'changefreq': opts.changefreq}

# Loop-invariant objects: compile the filters and resolve the time zone once.
include_re = re.compile(opts.include)
exclude_re = re.compile(opts.exclude)
tz = timezone(opts.timezone)

# One dict per selected file, in os.walk order.
results = []
for root, _dirs, files in os.walk(opts.path):
    for fn in files:
        # Both patterns are tested against the bare file name, anchored at
        # its start (re.match semantics).
        if not include_re.match(fn) or exclude_re.match(fn):
            continue

        fullfn = os.path.join(root, fn)

        # Build the URL from the path *relative to the explored root* so that
        # files in sub-directories keep their directory part, and always use
        # forward slashes regardless of the local path separator.
        rel_url = os.path.relpath(fullfn, opts.path).replace(os.sep, '/')
        loc = opts.url.rstrip('/') + '/' + rel_url

        # st_mtime is the content modification time; st_ctime would be the
        # metadata-change time on Unix and the creation time on Windows.
        dt = datetime.fromtimestamp(os.stat(fullfn).st_mtime, tz=tz)

        # "%z" yields "+HHMM"; the sitemap date format wants "+HH:MM".
        tzd = dt.strftime("%z")
        tzd = tzd[:-2] + ":" + tzd[-2:]
        lastmod = dt.strftime("%Y-%m-%dT%H:%M") + tzd

        data = defaults.copy()
        data.update(loc=loc, lastmod=lastmod)
        results.append(data)
# convert to xml
class NameSpace:
    """XML namespace URIs referenced by the generated document."""
    xsi = "http://www.w3.org/2001/XMLSchema-instance"


# Root <urlset> element carrying the sitemap namespace declaration and the
# xsi:schemaLocation attribute that points at the sitemap schema.
root = etree.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9", nsmap={'xsi': NameSpace.xsi})
root.attrib[etree.QName(NameSpace.xsi, 'schemaLocation')] = (
    "http://www.sitemaps.org/schemas/sitemap/0.9 "
    "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd")

# Timestamp comment, inserted before any <url> entry is added.
comment = etree.Comment("Auto-generated on {}".format(datetime.now().ctime()))
root.insert(1, comment)

# One <url> element per collected entry; the children are created in this
# fixed order and filled from the entry's same-named keys.
for data in results:
    url_el = etree.SubElement(root, "url")
    for tag in ("loc", "lastmod", "changefreq", "priority"):
        etree.SubElement(url_el, tag).text = data[tag]

# etree.tostring() already returns UTF-8 encoded bytes matching the XML
# declaration it emits, so write them unchanged in binary mode.  Decoding and
# re-encoding through a text-mode file would use the locale's encoding and
# could contradict the declared one.
with open(opts.output, 'wb') as fd:
    fd.write(etree.tostring(root, encoding='UTF-8', xml_declaration=True, pretty_print=True))