|
|
@@ -0,0 +1,144 @@
|
|
|
+// Short aliases for the raw table accessors; the crawler below calls them as
+// trget(table, key, default) and trset(table, key, value).
+auto trget = table_rawget;
|
|
|
+auto trset = table_rawset;
|
|
|
+
|
|
|
+/*
+ * Writer callback for full downloads: appends each received chunk to the
+ * sink passed as `udata` and reports the whole chunk as consumed.
+ *
+ * buffer - chunk of received data.
+ * size, nitems - accepted for the callback signature, not used here.
+ * udata  - sink object exposing write() (the callers in this script pass a blob).
+ *
+ * Returns the length of `buffer`.
+ */
+int_t curl_reader(buffer, size, nitems, udata)
+{
+    // Debug trace, enable when needed:
+    //print("***Got", udata, buffer.len(), size, nitems, "\n", buffer, "\n");
+    udata.write(buffer);
+    auto consumed = buffer.len();
+    return consumed;
+}
|
|
|
+
|
|
|
+// One EasyCurl handle, reused by every request made in this script.
+auto curl = EasyCurl();
+
+// Alternative writer registrations, kept for reference:
+//curl.set_writer(data);
+//curl.set_writer(curl_reader);
+
+// Uncomment for verbose transfer logging:
+//curl.setopt( curl.CURLOPT_VERBOSE, 1 );
+
+// Uncomment to skip TLS peer verification (insecure):
+//curl.setopt( curl.CURLOPT_SSL_VERIFYPEER, 0 );
+
+// Follow redirects.
+curl.setopt(curl.CURLOPT_FOLLOWLOCATION, 1);
+
+// Accept compressed responses (libcurl documents the empty string as
+// "offer every encoding this build supports").
+curl.setopt(curl.CURLOPT_ACCEPT_ENCODING, "");
|
|
|
+
|
|
|
+/*
+ * Writer callback for header-only requests: appends each received chunk to
+ * the sink passed as `udata`, and returns 0 instead of the chunk length as
+ * soon as a 2-byte chunk arrives.
+ *
+ * buffer - chunk of received data.
+ * size, nitems - accepted for the callback signature, not used here.
+ * udata  - sink object exposing write() (the caller in this script passes a blob).
+ *
+ * Returns 0 for a 2-byte chunk, otherwise the length of `buffer`.
+ */
+int_t curl_reader_header(buffer, size, nitems, udata)
+{
+    // Debug trace, enable when needed:
+    //print("***Got", udata, buffer.len(), size, nitems, "\n", buffer, "\n");
+    udata.write(buffer);
+    auto chunk_len = buffer.len();
+    // A 2-byte chunk is taken to be the blank "\r\n" line that closes the
+    // header section; reporting 0 bytes consumed is meant to stop the
+    // transfer before the body is downloaded.
+    if(chunk_len == 2) return 0;
+    return chunk_len;
+}
|
|
|
+
|
|
|
+/*
+ * Request `url` with header output switched on and return whatever the
+ * header writer collected, as a string.
+ *
+ * The shared `curl` handle is used; CURLOPT_HEADER is switched back to 0
+ * after the request so later calls on the handle are not affected.
+ */
+auto getUrlHeadersData(url)
+{
+    auto header_buf = blob(0, 8192);
+    curl.set_writer(curl_reader_header, header_buf);
+    curl.setopt(curl.CURLOPT_HEADER, 1);
+    curl.setopt(curl.CURLOPT_URL, url);
+    // The return value of perform() is not inspected; callers judge success
+    // by the length of the returned text.
+    curl.perform();
+    curl.setopt(curl.CURLOPT_HEADER, 0);
+    return header_buf.tostring();
+}
|
|
|
+
|
|
|
+/*
+ * Download `url` through the shared `curl` handle and return the collected
+ * response data as a string.
+ */
+auto getUrlData(url)
+{
+    auto body_buf = blob(0, 8192);
+    curl.set_writer(curl_reader, body_buf);
+    curl.setopt(curl.CURLOPT_URL, url);
+    // The return value of perform() is not inspected; callers judge success
+    // by the length of the returned text.
+    curl.perform();
+    return body_buf.tostring();
+}
|
|
|
+
|
|
|
+/*
+ * Manual smoke test: prints the headers and then the full data fetched
+ * from a fixed URL.
+ */
+auto function testGetUrl()
+{
+    auto target = "https://www.easycredito.me/";
+    print(getUrlHeadersData(target));
+    print(getUrlData(target));
+}
|
|
|
+
|
|
|
+//testGetUrl();
|
|
|
+
|
|
|
+/*
+ * Download `url`, collect every quoted src=/href= target found in its markup
+ * and check each collected link.
+ *
+ * url           - page to download and scan.
+ * host          - host name used to recognise internal links ('.' is escaped
+ *                 before the name is used as a pattern).
+ * links         - table filled in place: absolute link -> name of the tag
+ *                 that referenced it.
+ * links_visited - table filled in place: link -> true when the request
+ *                 returned data, false when it returned nothing.
+ *
+ * Links referenced by an "a" tag, ending in ".html" and matching `host` are
+ * crawled recursively; every other link only gets a header request.
+ * Each link is requested at most once per run. Returns nothing.
+ */
+auto checkLinks(url, host, links, links_visited)
+{
+    auto txt = getUrlData(url);
+    if(!txt || !txt.len())
+    {
+        // Record the failure: otherwise the page stays "unvisited", is
+        // downloaded again by every enclosing loop and never shows up in
+        // the final report.
+        trset(links_visited, url, false);
+        return;
+    }
+    print("txt", txt.len());
+    trset(links_visited, url, true);
+
+    auto host_re = host.replace(".", "%.");
+
+    // Prefix ("scheme://host/") prepended to relative links.
+    auto url_host = url.match("^[^/]*//[^/]+/");
+    if(!url_host)
+    {
+        // The URL has no '/' after the host (e.g. "https://example.org"):
+        // build the prefix from the host part instead of concatenating null.
+        auto bare_host = url.match("^[^/]*//[^/]+");
+        url_host = (bare_host ? bare_host : url) + "/";
+    }
+
+    // gmatch callback: mt = tag name, mlt = attribute name, ml = raw value.
+    // Always returns true so that the scan continues.
+    auto function setLink(mt, mlt, ml)
+    {
+        auto quote = ml[0];
+        // Only quoted attribute values are handled.
+        if(quote != '"' && quote != '\'') return true;
+
+        ml = ml.match(format("%c([^%c]+)%c", quote, quote, quote));
+        if(!ml) return true;
+
+        if(!ml.match("//"))
+        {
+            // Relative link: resolved against the site root.
+            if(ml[0] == '/') ml = ml.slice(1);
+            ml = url_host + ml;
+        }
+        trset(links, ml, mt);
+        return true;
+    }
+
+    txt.gmatch("<(%a+)%s+[^<>]*(src)=(%S+)", setLink);
+    txt.gmatch("<(%a+)%s+[^<>]*(href)=(%S+)", setLink);
+    /*
+    Disabled: also collect url=... targets.
+    txt.gmatch(
+        "url=(%S+)",
+        function(m)
+        {
+            auto first_chr = m[0];
+            switch(first_chr)
+            {
+                case '"':
+                case '\'':
+                    m = m.match(format("%c([^%c]+)%c", first_chr, first_chr, first_chr));
+                    break;
+            }
+            trset(links, m, true);
+            return true;
+        }
+    );
+    */
+
+    // Iterate over a snapshot of the keys: the recursive call below inserts
+    // new slots into `links`, which must not happen while the table itself
+    // is being traversed. Links added by a nested call are handled by that
+    // call's own loop.
+    auto pending = [];
+    foreach(k, v in links) pending.append(k);
+
+    foreach(k in pending)
+    {
+        // null means "never requested"; a stored false marks a link that was
+        // already tried and failed, and must not be requested again.
+        if(trget(links_visited, k, null) != null) continue;
+
+        auto v = trget(links, k, null);
+        //print(v, k);
+        if( (v == "a") && k.endswith(".html") && k.match(host_re) )
+        {
+            print("***", v, k);
+            checkLinks(k, host, links, links_visited);
+        }
+        else
+        {
+            auto found = getUrlHeadersData(k);
+            print(":::", v, k, found.len());
+            trset(links_visited, k, found.len() > 0);
+        }
+    }
+    //print(txt);
+}
|
|
|
+
|
|
|
+// Crawl state shared by the whole run.
+auto links = {};          // absolute link -> tag name that referenced it
+auto links_visited = {};  // link -> result of the request made for it
+
+checkLinks("https://www.easycredito.me/", "easycredito.me", links, links_visited);
+//checkLinks("http://mymvpblueprint.com/", "mymvpblueprint.com", links, links_visited);
+
+// Report: every collected link, then every requested link with its outcome.
+foreach(link, tag in links)
+{
+    print(tag, link);
+}
+foreach(link, outcome in links_visited)
+{
+    print(outcome, link);
+}
|