check-links.lua 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. -----------------------------------------------------------------------------
  2. -- Little program that checks links in HTML files, using coroutines and
  3. -- non-blocking I/O via the dispatcher module.
  4. -- LuaSocket sample files
  5. -- Author: Diego Nehab
  6. -----------------------------------------------------------------------------
  7. local url = require("socket.url")
  8. local dispatch = require("dispatch")
  9. local http = require("socket.http")
  -- timeout handed to the dispatcher module (presumably in seconds)
  dispatch.TIMEOUT = 10

  -- make sure the user knows how to invoke us
  arg = arg or {}
  if #arg < 1 then
      print("Usage:\n luasocket check-links.lua [-n] {<url>}")
      -- 'exit' is not a Lua global, so calling it would raise an error
      -- instead of leaving cleanly; os.exit is the standard way out.
      -- A non-zero status signals that the invocation was incomplete.
      os.exit(1)
  end

  -- '-n' means we are running in non-blocking mode.
  -- 'handler' is kept global, as in the original script, so that any code
  -- referring to it by name keeps working.
  if arg[1] == "-n" then
      -- non-blocking I/O was requested: use the real dispatcher interface
      -- and drop the flag so that only addresses remain in 'arg'
      table.remove(arg, 1)
      handler = dispatch.newhandler("coroutine")
  else
      -- blocking I/O: use the fake (sequential) dispatcher interface
      handler = dispatch.newhandler("sequential")
  end

  -- number of link checks that have been started but have not finished yet
  local nthreads = 0
  27. -- get the status of a URL using the dispatcher
  -- Start a status check for 'link' through the dispatcher handler.
  -- Only links whose parsed scheme is "http" are checked; every other link
  -- is skipped without output. The check issues a HEAD request and writes
  -- one tab-indented line: the link alone on a 200 reply, otherwise the
  -- link followed by the code (or error) that came back.
  -- 'nthreads' is raised when a check is started and lowered when it ends.
  function getstatus(link)
      local scheme = url.parse(link, {scheme = "file"}).scheme
      if scheme ~= "http" then return end

      -- body of the check; run by the handler, not called directly
      local function check()
          local result, code = http.request{
              method = "HEAD",
              url = link,
              create = handler.tcp
          }
          if result and code == 200 then
              io.write('\t', link, '\n')
          else
              io.write('\t', link, ': ', tostring(code), '\n')
          end
          nthreads = nthreads - 1
      end

      nthreads = nthreads + 1
      handler:start(check)
  end
  -- Read the entire contents of a local file.
  -- path: URL-escaped file path; it is passed through url.unescape before
  --       the file is opened.
  -- Returns the file contents as a string, or nil plus an error message
  -- when no path was given or the file could not be opened.
  function readfile(path)
      -- Callers may hand over nil (e.g. a parsed URL without a path part);
      -- report that as an ordinary error instead of unescaping nil.
      if path == nil then
          return nil, "missing file path"
      end
      -- 'err' rather than 'error', so the builtin error() is not shadowed
      local file, err = io.open(url.unescape(path), "r")
      if not file then
          return nil, err
      end
      local body = file:read("*a")
      file:close()
      return body
  end
  -- Fetch the document that 'u' refers to.
  -- A URL without a scheme is treated as a "file" URL.
  --   http: the document is requested with http.request; on a 200 reply
  --         that carries a 'location' header, that header becomes the base.
  --   file: the document is read from disk with readfile.
  -- Returns base, body, err: the URL that links should be resolved against,
  -- the document text (nil on failure) and an error value (nil on success).
  -- NOTE(review): this global definition replaces Lua's standard 'load'
  -- function for the rest of the program.
  function load(u)
      local parsed = url.parse(u, { scheme = "file" })
      local scheme = parsed.scheme
      local base, body, err = u, nil, nil

      if scheme == "http" then
          local code, headers
          body, code, headers = http.request(u)
          -- if there was a redirect, update base to reflect it
          if code == 200 and headers.location then
              base = headers.location
          end
          -- on failure http.request's second result is the error
          if not body then
              err = code
          end
      elseif scheme == "file" then
          body, err = readfile(parsed.path)
      else
          err = string.format("unhandled scheme '%s'", scheme)
      end

      return base, body, err
  end
  -- Collect the targets of all href attributes found in an HTML document.
  -- body: the HTML text.
  -- base: URL that every href is resolved against with url.absolute.
  -- Returns an array of resolved links: double-quoted hrefs first, then
  -- single-quoted ones, then unquoted ones. HTML comments are ignored.
  function getlinks(body, base)
      local links = {}

      -- Record one href and return "" so that gsub removes the attribute it
      -- just matched. When a gsub callback returns nothing, Lua 5.1 and
      -- later keep the match in the string; the quoted hrefs would then be
      -- found a second time, quotes included, by the unquoted pattern below.
      local function collect(href)
          links[#links + 1] = url.absolute(base, href)
          return ""
      end

      -- get rid of comments
      body = string.gsub(body, "%<%!%-%-.-%-%-%>", "")
      -- quoted hrefs: double quotes, then single quotes
      body = string.gsub(body, '[Hh][Rr][Ee][Ff]%s*=%s*"([^"]*)"', collect)
      body = string.gsub(body, "[Hh][Rr][Ee][Ff]%s*=%s*'([^']*)'", collect)
      -- what is left is unquoted: the value ends at whitespace or at the
      -- closing '>', so attributes that follow are not glued onto the link
      string.gsub(body, "[Hh][Rr][Ee][Ff]%s*=%s*([^%s>]+)", collect)

      return links
  end
  -- Load the document at 'address' and start a status check for each link
  -- found in it. When the document cannot be loaded, the error is printed
  -- and nothing else happens.
  function checklinks(address)
      local base, body, err = load(address)
      if body then
          print("Checking ", base)
          for _, link in ipairs(getlinks(body, base)) do
              getstatus(link)
          end
      else
          print(err)
      end
  end
  -- check every address given on the command line, resolving each one
  -- against "file:" first
  for index = 1, #arg do
      checklinks(url.absolute("file:", arg[index]))
  end

  -- keep stepping the handler until every started check has finished
  while nthreads > 0 do
      handler:step()
  end