/*

(C) Copyright 2011.  All rights reserved.
Intellectual property of Matthew William Coan.

C++ web robot...

Author: Matthew William Coan
Date: Sun Aug  7 23:37:16 EDT 2011

*/

#include <iostream>
#include <fstream>

#include <map>
#include <list>
#include <string>

#include <cctype>

#include <unistd.h>

#include "thread.h"
#include "tcp_stream.h"

using namespace std;
using namespace thread_tools;
using namespace net_tools;

// Upper bound on the number of response bytes Bot::get_page() accumulates
// before it stops reading from the socket (1 MiB).
#define MAX_PAGE_SIZE (1024 * 1024)

class Bot : public Thread {
public:
   typedef list< string > string_list_type;

private:
   Queue< string > * _queue;
   string_list_type * _visit;
   ofstream * _out;
   Mutex * _mutex0;
   Mutex * _mutex1;

public:
   Bot(const string start_url, 
       Queue< string > * p, 
       ofstream * out,
       Mutex * m0,
       Mutex * m1,
       string_list_type * visit) {
      _queue = p;
      _out = out;
      _mutex0 = m0;
      _mutex1 = m1;
      _visit = visit;
      push(start_url);
   }

   ~Bot() { }

   string pop() {
      return _queue->get();
   }

   bool visit(const string & url) {
      bool ret = false;
      _mutex1->lock();

      for(string_list_type::iterator ptr = _visit->begin(); ptr != _visit->end(); ptr++) {
         if(url == (*ptr)) {
            ret = true;
            break;
         }
      }

      _mutex1->unlock();

      return ret;
   }

   void push(const string surl) {
      string url = "http://" + get_domain(surl) + "/";
      if(!visit(url)) {
cout << "new-url(" << url << ")" << endl << flush;
         _queue->put(url);
         _mutex1->lock();
         _visit->push_back(url);
         _mutex1->unlock();
      }
   }

   string get_uri(const string & url) {
      string ret;
      size_t count = 0;

      for(size_t i = 0; i < url.size(); i++) {
         if(url[i] == '/')
            count++;

         if(url[i] == ':' || url[i] == '<' || url[i] == '>') {
            ret += "/";
            break;
         }

         if(count >= 3)
            ret += url[i];
      }

      return ret;
   }

   string get_domain(const string & url) {
      string ret;

      for(size_t i = 7; i < url.size() && url[i] != '/' && url[i] != ':'; i++) {
         if((!isalnum(url[i]) && !ispunct(url[i])) 
            || url[i] == '<' || url[i] == '>' || url[i] == '\"' || url[i] == '\'')
            break;
         ret += url[i];
      }

      return ret;
   }

   string get_page(const string & url) {
cout << "get_page(" << url << ")" << endl << flush;
      tcp_stream http(get_domain(url).c_str(), 80);
      string ret;

      if(http) {
         string request = "GET / HTTP/1.0\r\n\r\n";
         http << request;
         char ch;
         ch = http.get();
         while(http) {
            ret += ch;
            if(ret.size() >= MAX_PAGE_SIZE)
               break;
            ch = http.get();
         }
         if(ret.find("text/html") != string::npos) { 
            if(ret.find("402") == string::npos 
               && ret.find("200") == string::npos) {
               ret = "";
            }
         }
         else 
            ret = "";
         http.close();
      }

      return ret;
   }

   void extract_url(const string & page) {
      size_t ptr = page.find("http://");
      size_t ptr2;
      string url;
      while(ptr < page.size()) {
         ptr2 = ptr;
         while(page[ptr2] != '\0' && page[ptr2] != '\"' && page[ptr2] != '\'' 
               && page[ptr2] != ' ' && page[ptr2] != '\t' && page[ptr2] != '\r' && page[ptr2] != '\n') {
            url += page[ptr2];
            ptr2++;
         }
         push(url);
         url = "";
         ptr = page.find("http://", ptr+7);
      }
   }

   string strip(const string & html) {
      string ret;
      bool in = false;
      for(size_t i = 0; i < html.size(); i++) {
         if(html[i] == '<')
            in = true;
         else if(html[i] == '>') {
            in = false;
            ret += ' ';
         }
         else if(!in && (isalnum(html[i]) || ispunct(html[i]) || isspace(html[i]))) {
            if(html[i] == '\n' || html[i] == '\r')
               ret += " ";
            else
               ret += html[i];
         }
      } 
      return ret;
   }

   string get_title(const string & page) {
      string title;

      size_t start = page.find("<title>");
      size_t end = page.find("</title>");

      if(start != string::npos && end != string::npos) {
         start += 7;
         for(size_t i = start; i < end; i++)
            title += page[i];
      }

      return strip(title);
   }

   string get_user_agent(const string & page) {
      string ua;

      size_t start = page.find("Server: ");
      size_t end = start;
      if(start != string::npos) {
         start += 8;
         end += 8;
         while(page[end] != '\r' && page[end] != '\n') {
            end++;
            if(end >= page.size())
               break;
         }
         ua = page.substr(start, end - start);
      }

      return ua;
   }

   void run() {
      while(true) {
         string url = pop();
         string page = get_page(url);
         string title = get_title(page);
         string user_agent = get_user_agent(page);
         _mutex0->lock();
         (*_out) << "\"http://" << get_domain(url) 
                 << "/\",\"" << title << "\",\"" 
                 << user_agent << "\"" << endl << flush;
         _mutex0->unlock();
         extract_url(page);
      }
   }
};

// Entry point: crawl starting from the URL given as the only argument.
//
// Creates the shared queue, visited list, mutexes and output file
// ("output.txt" in the current directory), starts five Bot threads on them
// and waits for the threads to finish. Returns 1 on a usage error or when
// the output file cannot be opened, 0 otherwise.
int
main(int argc, char ** argv, char ** envp)
{
   (void)envp;   // unused

   if(argc != 2) {
      cerr << "usage: " << argv[0] << " <start-URL>" << endl;
      return 1;   // a usage error must not look like success to the caller
   }

   Bot::string_list_type the_visit;
   Queue< string > the_queue;
   Mutex mutex0;   // guards the output stream
   Mutex mutex1;   // guards the visited list

   // NOTE(review): without ios::in or ios::app this mode truncates an
   // existing file, so ios::ate has no practical effect. Use ios::app if
   // results of earlier runs are meant to be kept.
   ofstream out("output.txt", ios::out | ios::ate);
   if(!out) {
      cerr << argv[0] << ": cannot open output.txt for writing" << endl;
      return 1;
   }

   // Each constructor tries to queue the start URL; only the first one
   // succeeds because they all share the_visit.
   Bot bot1(argv[1], &the_queue, &out, &mutex0, &mutex1, &the_visit);
   Bot bot2(argv[1], &the_queue, &out, &mutex0, &mutex1, &the_visit);
   Bot bot3(argv[1], &the_queue, &out, &mutex0, &mutex1, &the_visit);
   Bot bot4(argv[1], &the_queue, &out, &mutex0, &mutex1, &the_visit);
   Bot bot5(argv[1], &the_queue, &out, &mutex0, &mutex1, &the_visit);

   bot1.start();
   bot2.start();
   bot3.start();
   bot4.start();
   bot5.start();

   // Bot::run() loops forever, so these joins are not expected to return
   // during normal operation.
   bot5.join();
   bot4.join();
   bot3.join();
   bot2.join();
   bot1.join();

   out.close();

   return 0;
}
