wiki.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. #!/usr/bin/env python3
  2. # OpenMPT help file scraper
  3. # by coda (https://coda.s3m.us/) and Saga Musix (https://sagamusix.de/)
  4. # This script downloads the OpenMPT manual TOC and then downloads all pages
  5. # from that TOC. The pages are parsed and all required image files are fetched.
  6. # The script also generates the appropriate files that can be fed into the
  7. # HTML Help Workshop to generate a CHM file.
  8. from urllib.request import urlopen, urlretrieve
  9. import re, os, shutil, subprocess
  10. base_url = 'https://wiki.openmpt.org'
  11. base_url_regex = 'https?://wiki.openmpt.org'
  12. os.chdir(os.path.dirname(os.path.abspath(__file__)))
  13. shutil.rmtree('html', ignore_errors=True)
  14. shutil.copytree('source', 'html')
  15. style = urlopen(base_url + '/load.php?debug=false&lang=en&modules=mediawiki.legacy.commonPrint%2Cshared%7Cmediawiki.page.gallery.styles%7Cmediawiki.skinning.interface%7Cskins.vector.styles%7Csite.styles&only=styles&skin=vector').read().decode('UTF-8')
  16. # Remove a few unused CSS classes
  17. style = re.sub(r'\}(\w+)?[\.#]vector([\w >]+)\{.+?\}', '}', style)
  18. style_file = open('html/style.css', 'w')
  19. style_file.write(style)
  20. style_file.close()
  21. toc_page = urlopen(base_url + '/index.php?title=Manual:_CHM_TOC&action=render').read().decode('UTF-8')
  22. pages = re.findall('href="' + base_url_regex + '/(.+?)"', toc_page)
  23. def destname(p):
  24. p = p.split(':_')[1]
  25. p = p.replace('/', '_')
  26. p = p.replace('.', '_')
  27. while p.find('__') >= 0:
  28. p = p.replace('__', '_')
  29. if p.find('#') >= 0:
  30. parts = p.split('#')
  31. return parts[0] + '.html#' + parts[1]
  32. return p + '.html'
  33. def title(p):
  34. p = p.split(':_')[1]
  35. p = p.replace('_', ' ')
  36. return p
  37. def localurl(p):
  38. p = destname(p)
  39. return p
  40. def replace_images(m):
  41. global base_url
  42. filepath = m.group(1) + '/' + m.group(2) + '/'
  43. filename = m.group(3)
  44. project.write(filename + "\n")
  45. urlretrieve(base_url + '/images/' + filepath + filename, 'html/' + filename)
  46. return '"' + filename + '"'
  47. def fix_internal_links(m):
  48. return '<a href="' + localurl(m.group(1)) + '"'
  49. project = open('html/OpenMPT Manual.hhp', 'w')
  50. project.write("""[OPTIONS]
  51. Compatibility=1.1 or later
  52. Compiled file=OpenMPT Manual.chm
  53. Contents file=OpenMPT Manual.hhc
  54. Display compile progress=No
  55. Full-text search=Yes
  56. Language=0x409 English (United States)
  57. Title=OpenMPT Manual
  58. Default Window=OpenMPT
  59. Default topic=""" + localurl(pages[0]) + """
  60. [WINDOWS]
  61. OpenMPT=,"OpenMPT Manual.hhc",,""" + localurl(pages[0]) + """,,,,,,0x42520,215,0x300e,[20,20,780,580],0xb0000,,,,,,0
  62. [FILES]
  63. style.css
  64. help.css
  65. bullet.png
  66. external.png
  67. """)
  68. for p in pages:
  69. content = urlopen(base_url + '/index.php?title=' + p + '&action=render').read().decode('UTF-8')
  70. # Download and replace image URLs
  71. content = re.sub(r' srcset=".+?"', '', content);
  72. content = re.sub(r'"/images/thumb/(\w+)/(\w+)/([^\/]+?)/([^\/]+?)"', replace_images, content)
  73. content = re.sub(r'"/images/(\w+)/(\w+)/([^\/]+?)"', replace_images, content)
  74. # Remove comments
  75. content = re.sub(r'<!--(.+?)-->', '', content, flags = re.DOTALL)
  76. # Fix local URLs
  77. content = re.sub(r'<a href="' + base_url_regex + '/File:', '<a href="', content)
  78. content = re.sub(r'<a href="' + base_url_regex + '/(Manual:.+?)"', fix_internal_links, content)
  79. content = re.sub(r'<a href="/(Manual:.+?)"', fix_internal_links, content)
  80. # Remove templates that shouldn't turn up in the manual
  81. content = re.sub(r'<div class="todo".+?</div>', '', content, flags = re.DOTALL);
  82. content = re.sub(r'<p class="newversion".+?</p>', '', content, flags = re.DOTALL);
  83. # Don't need this attribute in our CHM
  84. content = re.sub(r' rel="nofollow"', '', content);
  85. section = re.match(r'(.+)/', title(p))
  86. section_str = ''
  87. if section:
  88. section_str = section.group(1)
  89. content = """<!DOCTYPE html>
  90. <html lang="en">
  91. <head>
  92. <meta http-equiv="X-UA-Compatible" content="IE=edge">
  93. <link href="style.css" rel="stylesheet">
  94. <link href="help.css" rel="stylesheet">
  95. <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  96. <title>OpenMPT Manual - """ + title(p) + """</title>
  97. </head>
  98. <body>
  99. <h1>""" + title(p) + '</h1><div id="content" class="mw-body">' + content + '</div></body></html>'
  100. saved = open('html/' + destname(p), 'wb')
  101. saved.write(bytes(content, 'UTF-8'))
  102. saved.close()
  103. project.write(destname(p)+"\n")
  104. print(p)
  105. project.close()
  106. # Create TOC
  107. toc = open('html/OpenMPT Manual.hhc', 'w')
  108. toc.write("""
  109. <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
  110. <HTML>
  111. <HEAD>
  112. <meta name="GENERATOR" content="OpenMPT Help Generator">
  113. <!-- Sitemap 1.0 -->
  114. </HEAD><BODY>
  115. <OBJECT type="text/site properties">
  116. <param name="ImageType" value="Folder">
  117. </OBJECT>
  118. """)
  119. def toc_parse(m):
  120. return """<OBJECT type="text/sitemap">
  121. <param name="Name" value=\"""" + m.group(2) + """">
  122. <param name="Local" value=\"""" + localurl(m.group(1)) + """">
  123. </OBJECT>"""
  124. def toc_parse_chapter(m):
  125. return """<li><OBJECT type="text/sitemap">
  126. <param name="Name" value=\"""" + m.group(1) + """">
  127. </OBJECT>"""
  128. toc_text = re.sub(r'<!--(.+?)-->', '', toc_page, flags = re.DOTALL)
  129. toc_text = re.sub(r'<div(.+?)>', '', toc_text, flags = re.DOTALL)
  130. toc_text = re.sub(r'</div>', '', toc_text, flags = re.DOTALL)
  131. toc_text = re.sub(r'<a href="' + base_url_regex + '/(.+?)".*?>(.+?)</a>', toc_parse, toc_text)
  132. toc_text = re.sub(r'<li>([^<]+)$', toc_parse_chapter, toc_text, flags = re.MULTILINE)
  133. toc.write(toc_text)
  134. toc.write("""
  135. </BODY></HTML>
  136. """)
  137. toc.close()
  138. if(subprocess.call(['../../build/tools/htmlhelp/hhc.exe', '"html/OpenMPT Manual.hhp"']) != 1):
  139. raise Exception("Something went wrong during manual creation!")
  140. try:
  141. os.remove('../../packageTemplate/html/OpenMPT Manual.chm')
  142. except OSError:
  143. pass
  144. shutil.copy2('html/OpenMPT Manual.chm', '../../packageTemplate/')