123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176 |
- #!/usr/bin/env python3
- # OpenMPT help file scraper
- # by coda (https://coda.s3m.us/) and Saga Musix (https://sagamusix.de/)
- # This script downloads the OpenMPT manual TOC and then downloads all pages
- # from that TOC. The pages are parsed and all required image files are fetched.
- # The script also generates the appropriate files that can be fed into the
- # HTML Help Workshop to generate a CHM file.
- from urllib.request import urlopen, urlretrieve
- import re, os, shutil, subprocess
# Root of the OpenMPT wiki; every download URL below is built from it.
base_url = 'https://wiki.openmpt.org'
# Regex fragment matching the wiki root (http or https) inside rendered HTML.
# The dots are escaped so that they only match the literal dots of the host
# name instead of any character.
base_url_regex = r'https?://wiki\.openmpt\.org'

# All relative paths used by this script are resolved against the directory
# that contains the script itself.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Start from a fresh 'html' output folder that is a copy of 'source'.
shutil.rmtree('html', ignore_errors=True)
shutil.copytree('source', 'html')
# Download the combined wiki stylesheet. The response is closed as soon as
# its body has been read.
with urlopen(base_url + '/load.php?debug=false&lang=en&modules=mediawiki.legacy.commonPrint%2Cshared%7Cmediawiki.page.gallery.styles%7Cmediawiki.skinning.interface%7Cskins.vector.styles%7Csite.styles&only=styles&skin=vector') as response:
    style = response.read().decode('UTF-8')
# Remove a few unused CSS classes: every rule whose selector contains
# ".vector" or "#vector" is dropped, keeping the "}" of the preceding rule.
style = re.sub(r'\}(\w+)?[\.#]vector([\w >]+)\{.+?\}', '}', style)
# The stylesheet was decoded as UTF-8, so write it back as UTF-8 rather than
# relying on the platform's default text encoding.
with open('html/style.css', 'w', encoding='UTF-8') as style_file:
    style_file.write(style)
# Download the rendered table-of-contents page of the manual.
with urlopen(base_url + '/index.php?title=Manual:_CHM_TOC&action=render') as response:
    toc_page = response.read().decode('UTF-8')
# Every absolute link into the wiki found on the TOC page is a manual page to
# fetch; the captured group is the page name that follows the wiki root.
pages = re.findall('href="' + base_url_regex + '/(.+?)"', toc_page)
# pages[0] is used as the default topic later on, so fail early with a clear
# message instead of an IndexError further down.
if not pages:
    raise Exception("No manual pages found in the TOC page!")
def destname(p):
    """Return the local HTML file name (plus optional anchor) for wiki page *p*.

    *p* is a wiki page name such as ``Manual:_Setup/General#Options``.  The
    text in front of the first ``:_`` is dropped, ``/`` and ``.`` are turned
    into ``_``, runs of underscores are collapsed into a single one and
    ``.html`` is appended.

    A ``#fragment`` is carried over unchanged behind the file name.  It must
    not be sanitised like the file name, otherwise the link would no longer
    match the element id inside the saved page.

    Raises IndexError if the page part of *p* does not contain ``:_``.
    """
    # Split the anchor off first so that only the page name is rewritten.
    page, hash_sign, fragment = p.partition('#')
    page = page.split(':_')[1]
    page = page.replace('/', '_').replace('.', '_')
    page = re.sub(r'_{2,}', '_', page)
    return page + '.html' + hash_sign + fragment
-
def title(p):
    """Return the display title of wiki page *p*.

    The title is the second ``:_``-separated piece of *p* with every
    underscore shown as a space.  Raises IndexError if *p* contains no
    ``:_``.
    """
    page_name = p.split(':_')[1]
    return ' '.join(page_name.split('_'))
def localurl(p):
    """Return the link target inside the CHM for wiki page *p*.

    This is exactly the file name (and anchor) produced by destname().
    """
    return destname(p)
def replace_images(m):
    """re.sub callback: download a wiki image and return its local reference.

    Groups 1 and 2 of *m* are the two directory levels below ``/images/`` and
    group 3 is the image file name.  The file name is appended to the project
    file, the image is downloaded into the ``html`` folder, and the quoted
    bare file name is returned as the replacement text.
    """
    first_dir, second_dir, image_name = m.group(1), m.group(2), m.group(3)
    # Register the image in the project file before fetching it.
    project.write(image_name + "\n")
    image_url = base_url + '/images/' + first_dir + '/' + second_dir + '/' + image_name
    urlretrieve(image_url, 'html/' + image_name)
    return '"' + image_name + '"'
def fix_internal_links(m):
    """re.sub callback: rewrite the start of a link to a manual page.

    Group 1 of *m* is the wiki page name; the returned text is the opening
    ``<a href="...">`` fragment pointing at the local HTML file instead.
    """
    local_target = localurl(m.group(1))
    return ''.join(['<a href="', local_target, '"'])
# Create the HTML Help project file and write its fixed header. The file is
# left open here: replace_images() and the page loop below append further
# entries to the [FILES] section before it is closed.
project = open('html/OpenMPT Manual.hhp', 'w')
# The first page of the TOC is both the default topic and the window's home page.
default_topic = localurl(pages[0])
project_header_lines = [
    '[OPTIONS]',
    'Compatibility=1.1 or later',
    'Compiled file=OpenMPT Manual.chm',
    'Contents file=OpenMPT Manual.hhc',
    'Display compile progress=No',
    'Full-text search=Yes',
    'Language=0x409 English (United States)',
    'Title=OpenMPT Manual',
    'Default Window=OpenMPT',
    'Default topic=' + default_topic,
    '[WINDOWS]',
    'OpenMPT=,"OpenMPT Manual.hhc",,' + default_topic + ',,,,,,0x42520,215,0x300e,[20,20,780,580],0xb0000,,,,,,0',
    '[FILES]',
    'style.css',
    'help.css',
    'bullet.png',
    'external.png',
]
project.write('\n'.join(project_header_lines) + '\n')
# Download every page listed in the TOC, rewrite it for offline use, save it
# into the 'html' folder and register it in the project file. The project
# file is closed afterwards even if one of the pages fails.
try:
    for p in pages:
        # The response is closed as soon as the page body has been read.
        with urlopen(base_url + '/index.php?title=' + p + '&action=render') as response:
            content = response.read().decode('UTF-8')
        # Download and replace image URLs
        content = re.sub(r' srcset=".+?"', '', content)
        # Thumbnail URLs: group 3 is the path segment before the last one, so
        # replace_images() fetches /images/<g1>/<g2>/<g3>, not the thumb path.
        content = re.sub(r'"/images/thumb/(\w+)/(\w+)/([^\/]+?)/([^\/]+?)"', replace_images, content)
        content = re.sub(r'"/images/(\w+)/(\w+)/([^\/]+?)"', replace_images, content)
        # Remove comments
        content = re.sub(r'<!--(.+?)-->', '', content, flags=re.DOTALL)
        # Fix local URLs
        content = re.sub(r'<a href="' + base_url_regex + '/File:', '<a href="', content)
        content = re.sub(r'<a href="' + base_url_regex + '/(Manual:.+?)"', fix_internal_links, content)
        content = re.sub(r'<a href="/(Manual:.+?)"', fix_internal_links, content)
        # Remove templates that shouldn't turn up in the manual
        content = re.sub(r'<div class="todo".+?</div>', '', content, flags=re.DOTALL)
        content = re.sub(r'<p class="newversion".+?</p>', '', content, flags=re.DOTALL)
        # Don't need this attribute in our CHM
        content = re.sub(r' rel="nofollow"', '', content)

        # Wrap the rendered page body in a complete HTML document.
        page_title = title(p)
        content = """<!DOCTYPE html>
<html lang="en">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<link href="style.css" rel="stylesheet">
<link href="help.css" rel="stylesheet">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>OpenMPT Manual - """ + page_title + """</title>
</head>
<body>
<h1>""" + page_title + '</h1><div id="content" class="mw-body">' + content + '</div></body></html>'

        # Written as raw UTF-8 bytes to match the charset declared above.
        page_file = destname(p)
        with open('html/' + page_file, 'wb') as saved:
            saved.write(content.encode('UTF-8'))

        project.write(page_file + "\n")
        print(p)
finally:
    project.close()
# Create TOC
# The contents file is opened here and stays open until the converted TOC
# body and the closing tags have been written further down.
toc = open('html/OpenMPT Manual.hhc', 'w')
# The header starts with an empty line and ends with a line break.
toc_header_lines = [
    '',
    '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">',
    '<HTML>',
    '<HEAD>',
    '<meta name="GENERATOR" content="OpenMPT Help Generator">',
    '<!-- Sitemap 1.0 -->',
    '</HEAD><BODY>',
    '<OBJECT type="text/site properties">',
    '<param name="ImageType" value="Folder">',
    '</OBJECT>',
    '',
]
toc.write('\n'.join(toc_header_lines))
def toc_parse(m):
    """re.sub callback: turn a TOC link into a sitemap entry.

    Group 1 of *m* is the wiki page name (converted with localurl()) and
    group 2 is the link text, which becomes the entry's name.
    """
    entry_name = m.group(2)
    entry_target = localurl(m.group(1))
    return '\n'.join([
        '<OBJECT type="text/sitemap">',
        '<param name="Name" value="' + entry_name + '">',
        '<param name="Local" value="' + entry_target + '">',
        '</OBJECT>',
    ])
def toc_parse_chapter(m):
    """re.sub callback: turn a plain-text list item into a chapter entry.

    Group 1 of *m* is the chapter caption.  The result keeps the leading
    ``<li>`` and has a name but no local target.
    """
    chapter_name = m.group(1)
    return '\n'.join([
        '<li><OBJECT type="text/sitemap">',
        '<param name="Name" value="' + chapter_name + '">',
        '</OBJECT>',
    ])
# Convert the downloaded TOC page into sitemap markup, one substitution after
# another: strip comments, strip <div> wrappers, turn links into page entries
# and finally turn list items without a link into chapter entries.
toc_text = toc_page
for pattern, replacement, flags in (
    (r'<!--(.+?)-->', '', re.DOTALL),
    (r'<div(.+?)>', '', re.DOTALL),
    (r'</div>', '', re.DOTALL),
    (r'<a href="' + base_url_regex + '/(.+?)".*?>(.+?)</a>', toc_parse, 0),
    (r'<li>([^<]+)$', toc_parse_chapter, re.MULTILINE),
):
    toc_text = re.sub(pattern, replacement, toc_text, flags=flags)

# Append the closing tags and finish the contents file.
toc.write(toc_text + "\n</BODY></HTML>\n")
toc.close()
# Run the HTML Help compiler on the generated project file. This script
# treats an exit status of 1 as success; anything else aborts the build.
hhc_status = subprocess.call(['../../build/tools/htmlhelp/hhc.exe', '"html/OpenMPT Manual.hhp"'])
if hhc_status != 1:
    raise Exception("Something went wrong during manual creation!")

# Best effort: delete a CHM file under packageTemplate/html if one exists.
try:
    os.remove('../../packageTemplate/html/OpenMPT Manual.chm')
except OSError:
    pass

# Copy the freshly compiled manual into the package template folder.
shutil.copy2('html/OpenMPT Manual.chm', '../../packageTemplate/')
|