check-web-links.nut 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  // Short aliases for the table_rawset / table_rawget built-ins used throughout this script.
  auto trset = table_rawset;
  auto trget = table_rawget;
  // Write callback for full downloads: appends the received chunk to `udata`
  // and reports the whole chunk as consumed.
  //   buffer       - chunk handed over by the curl binding
  //   size, nitems - unused here
  //   udata        - sink object exposing write() (the callers in this file pass a blob)
  // Returns the length of `buffer`.
  int_t curl_reader(buffer, size, nitems, udata)
  {
      auto received = buffer.len();
      udata.write(buffer);
      return received;
  }
  // Single EasyCurl handle, configured once here and reused for every request.
  auto curl = EasyCurl();
  // Follow redirects.
  curl.setopt( curl.CURLOPT_FOLLOWLOCATION, 1);
  // Accept compressed responses (empty string, as in the original setup).
  curl.setopt( curl.CURLOPT_ACCEPT_ENCODING, "" );
  // Alternative / debugging settings, left disabled:
  //curl.set_writer(data);
  //curl.set_writer(curl_reader);
  //curl.setopt( curl.CURLOPT_VERBOSE, 1 );
  //curl.setopt( curl.CURLOPT_SSL_VERIFYPEER, 0 ); //insecure
  // Write callback for header-only probes: appends each chunk to `udata`, then
  // returns 0 as soon as a 2-byte chunk arrives. A 2-byte chunk is taken to be
  // the blank line that ends the header section (presumably "\r\n" - confirm);
  // returning 0 instead of the chunk length is meant to stop the transfer
  // before the body is downloaded.
  //   buffer       - chunk handed over by the curl binding
  //   size, nitems - unused here
  //   udata        - sink object exposing write()
  // Returns 0 for a 2-byte chunk, otherwise the length of `buffer`.
  int_t curl_reader_header(buffer, size, nitems, udata)
  {
      udata.write(buffer);
      auto chunk_len = buffer.len();
      if(chunk_len == 2)
      {
          return 0;
      }
      return chunk_len;
  }
  // Requests `url` with CURLOPT_HEADER switched on and curl_reader_header as
  // the writer, and returns whatever was collected as a string.
  // CURLOPT_HEADER is switched back off afterwards because the shared `curl`
  // handle is reused by other requests. The outcome of perform() is ignored.
  auto getUrlHeadersData(url)
  {
      auto header_buf = blob(0, 8192);
      curl.setopt( curl.CURLOPT_URL, url );
      curl.setopt( curl.CURLOPT_HEADER, 1);
      curl.set_writer(curl_reader_header, header_buf);
      curl.perform();
      curl.setopt( curl.CURLOPT_HEADER, 0);
      return header_buf.tostring();
  }
  // Downloads `url` through the shared `curl` handle with curl_reader as the
  // writer and returns everything that was received as a string.
  // The outcome of perform() is ignored.
  auto getUrlData(url)
  {
      auto body_buf = blob(0, 8192);
      curl.setopt( curl.CURLOPT_URL, url );
      curl.set_writer(curl_reader, body_buf);
      curl.perform();
      return body_buf.tostring();
  }
  // Manual smoke test: prints the header dump and then the full download of one fixed URL.
  auto function testGetUrl()
  {
      auto url = "https://www.easycredito.me/";
      print(getUrlHeadersData(url));
      print(getUrlData(url));
  }
  50. //testGetUrl();
  // Crawls one page and checks the links found on it.
  //
  //   url           - page to download and scan
  //   host          - host name; only "a" links ending in ".html" whose URL
  //                   matches it are crawled recursively
  //   links         - table filled in place: link URL -> name of the tag it was found in
  //   links_visited - table filled in place: URL -> true/false. Pages get true
  //                   when downloaded and false when the download came back
  //                   empty; other links get true when the header probe
  //                   returned any data.
  //
  // Every quoted src=/href= value in the page is collected. Values without
  // "//" are resolved against the root of the page's host (not against the
  // page's directory). Returns nothing.
  auto checkLinks(url, host, links, links_visited)
  {
      auto txt = getUrlData(url);
      if(!txt || !txt.len())
      {
          // Record the failure, otherwise every later page that links here
          // would download it again.
          trset(links_visited, url, false);
          return;
      }
      print("txt", txt.len());

      // Escape the dots so the host can be used as a Lua-style pattern.
      // NOTE(review): other pattern-magic characters (e.g. '-') are not escaped.
      auto host_re = host.replace(".", "%.");

      // "scheme://host/" prefix used to resolve relative links. The host is
      // matched without the trailing slash so that a URL such as
      // "https://example.com" still yields a usable prefix instead of null.
      auto url_host = url.match("^[^/]*//[^/]+");
      if(url_host)
      {
          url_host = url_host + "/";
      }
      else
      {
          // No "//" in the URL at all: best effort, use the URL itself as the base.
          url_host = url.endswith("/") ? url : url + "/";
      }

      // gmatch callback; receives tag name, attribute name and raw attribute value.
      // Only values that start with a quote character are processed.
      auto function setLink(mt, mlt, ml)
      {
          auto quote = ml[0];
          if(quote == '"' || quote == '\'')
          {
              // Extract the text between the matching quotes.
              ml = ml.match(format("%c([^%c]+)%c", quote, quote, quote));
              if(ml)
              {
                  if(!ml.match("//"))
                  {
                      if(ml[0] == '/')
                      {
                          ml = ml.slice(1);
                      }
                      ml = url_host + ml;
                  }
                  trset(links, ml, mt);
              }
          }
          return true;
      }
      txt.gmatch("<(%a+)%s+[^<>]*(src)=(%S+)", setLink);
      txt.gmatch("<(%a+)%s+[^<>]*(href)=(%S+)", setLink);
      /*
      txt.gmatch(
          "url=(%S+)",
          function(m)
          {
              auto first_chr = m[0];
              switch(first_chr)
              {
                  case '"':
                  case '\'':
                      m = m.match(format("%c([^%c]+)%c", first_chr, first_chr, first_chr));
                      break;
              }
              trset(links, m, true);
              return true;
          }
      );
      */
      trset(links_visited, url, true);

      // Walk a snapshot: the recursive calls below insert into `links`, and a
      // foreach over a table that is growing is not guaranteed to visit every
      // entry exactly once. Links added by a nested call are processed by that
      // nested call's own loop.
      auto pending = clone links;
      foreach(k,v in pending)
      {
          //print(v, k);
          // "Visited" means an entry exists, whatever its value; a link stored
          // as false (probe returned nothing) must not be fetched again.
          auto wasVisited = trget(links_visited, k, null) != null;
          if(wasVisited) continue;

          if( (v == "a")
              && (k.endswith(".html"))
              && k.match(host_re)
              )
          {
              print("***", v, k);
              checkLinks(k, host, links, links_visited);
          }
          else
          {
              auto found = getUrlHeadersData(k);
              print(":::", v, k, found.len());
              trset(links_visited, k, found.len() > 0);
          }
      }
      //print(txt);
  }
  // Script entry: crawl the site, then dump both result tables.
  auto links = {};         // link URL -> tag name it was found in
  auto links_visited = {}; // URL -> result recorded by checkLinks
  checkLinks("https://www.easycredito.me/", "easycredito.me", links, links_visited);
  //checkLinks("http://mymvpblueprint.com/", "mymvpblueprint.com", links, links_visited);
  foreach(link, tag in links)
  {
      print(tag, link);
  }
  foreach(link, status in links_visited)
  {
      print(status, link);
  }