#ifndef _TITLE_BOT_H
#define _TITLE_BOT_H

/*

WEB ROBOT CLASS...

Author: Matthew W. Coan
Date: Wed Aug 14 12:26:27 EDT 2013

*/

#include <iostream>
#include <fstream>

#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <cctype>

#include <vector>
#include <list>
#include <string>

#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <signal.h>

#include "d_db.h"
#include "d_alloc.h"
#include "d_list.h"

#include "tcp_stream.h"

#include "btree2.h"

#define DEFAULT_URL_DB   "/home/mcoan/title-bot/url.db"
#define DEFAULT_TEMP_DIR "/home/mcoan/title-bot/temp"
#define URL_FILE         "/home/mcoan/title-bot/title-bot.txt"
#define LIST_FILE        "/home/mcoan/title-bot/list.dat"
#define WAIT_TIMEOUT     "3"
#define BOT_NAME         "TitleBot-1.0"
#define TIMEOUT          3 
#define FORK_TIMEOUT	 3 
#define N_FORK           3 
#define N_FORK_MAX	 100
#define MAX_PAGE_SIZE    (1024*10)
#define MAX_TITLE        256

namespace title_bot {

using namespace net_tools;
using namespace db_tools;
using namespace std;

// SIGCHLD handler that deliberately does nothing; Robot::run() installs it
// with signal() before it starts forking workers.
//
// Declared inline because this function is defined in a header: without it,
// including the header from more than one translation unit produces a
// multiple-definition error at link time.
inline void
on_SIGCHLD(int /* arg */)
{
}

// SIGALRM handler: terminates the calling process with status 0.
// Robot::get_page() and Robot::get_page0() install it around alarm(TIMEOUT),
// so a fetch that overruns the timeout ends the process doing the fetch.
//
// Uses _exit() instead of exit(): exit() runs atexit handlers and flushes
// stdio, none of which is async-signal-safe, so calling it from a handler is
// undefined behaviour. _exit() is on the POSIX async-signal-safe list.
//
// Declared inline because the definition lives in a header (avoids
// multiple-definition link errors when the header is included twice).
inline void
on_SIGALRM(int /* arg */)
{
   _exit(0);
}

// Value type holding two strings. Robot::put() constructs it as
// URL(url, file), so `first` carries the page URL and `second` the path of
// the temp file the page is downloaded into.
class URL {
public:
   string first;
   string second;

   // Both strings start out empty.
   URL() : first(), second() { }

   // Builds an entry from its two parts.
   URL(const string & f, const string & s) : first(f), second(s) { }

   // Member-wise copy.
   URL(const URL & url) : first(url.first), second(url.second) { }

   ~URL() { }

   // Member-wise assignment; returns *this so assignments can be chained.
   URL & operator=(const URL & url) {
      this->first = url.first;
      this->second = url.second;
      return *this;
   }
};

// Writes a URL entry to a bfstream: `first` is written, then `second`.
// Returns the stream so writes can be chained.
inline
bfstream & operator<<(bfstream & out, const URL & url) {
   const string & head = url.first;
   const string & tail = url.second;
   out << head << tail;
   return out;
}

// Reads a URL entry from a bfstream in the same order operator<< writes it:
// `first`, then `second`. Returns the stream so reads can be chained.
inline
bfstream & operator>>(bfstream & in, URL & url) {
   string & head = url.first;
   string & tail = url.second;
   in >> head >> tail;
   return in;
}

typedef offset_type (*URL_size_type)(const URL & url);

// Size callback handed to d_list (see Robot::run()). For each of the two
// strings it counts sizeof(size_t) + length + 1 bytes.
// NOTE(review): presumably this mirrors how bfstream serialises a string
// (length field + characters + terminator) -- confirm against d_db.h.
//
// Declared inline because the definition lives in a header; a non-inline
// definition breaks the link as soon as two translation units include it.
inline offset_type URL_size(const URL & url) {
   return sizeof(size_t) + url.first.size() + 1 
          + sizeof(size_t) + url.second.size() + 1;
}

class Robot {
public:
   typedef d_list< URL, URL_size_type > queue_list_type;
   typedef list< int > pid_list_type;
   typedef btree visited_map_type;

private:
   queue_list_type * queue;
   string start;
   unsigned long file_count;
   visited_map_type visited;
   bool init;

public:
   // Creates a crawler that starts at `the_start`.
   //   the_start - first URL pushed onto the queue by run().
   //   init      - when true, run() recreates LIST_FILE and allocates its
   //               header before use.
   // The visited-URL tree is opened on DEFAULT_URL_DB.
   //
   // `queue` is set to NULL here because it only becomes valid inside run();
   // previously it was left uninitialised, so put()/get()/size() called
   // before run() dereferenced an indeterminate pointer.
   Robot(const string & the_start, bool init) 
   : queue(NULL),
     start(the_start),
     file_count(0UL),
     visited(DEFAULT_URL_DB),
     init(init) {
   }

   void put(const string & url, const string & file) {
      queue->push_back(URL(url,file));
   }

   // Removes the entry at the front of the pending queue and returns a copy.
   // No emptiness check is made here; callers test queue->size() first.
   URL get() {
      URL head(queue->front());
      queue->pop_front();
      return head;
   }

   // Returns true when `ch` may appear in a URL as recognised by process():
   // letters, digits and the punctuation / ? = . & - _ + % ~ @
   //
   // The character is converted to unsigned char before being classified:
   // passing a negative char (any byte >= 0x80 where char is signed) to the
   // <cctype> functions is undefined behaviour.
   bool is_url_char(const char ch) {
      if(isalnum(static_cast<unsigned char>(ch))) {
         return true;
      }

      switch(ch) {
         case '/': case '?': case '=': case '.':
         case '&': case '-': case '_': case '+':
         case '%': case '~': case '@':
            return true;
         default:
            return false;
      }
   }

   // Returns everything in `url` before its third '/' (for
   // "http://host/path" that is "http://host"). A URL with fewer than three
   // slashes is returned unchanged.
   string get_root_document(const string & url) {
      size_t cut = string::npos;
      size_t from = 0;
      for(int slashes = 0; slashes < 3; ++slashes) {
         cut = url.find('/', from);
         if(cut == string::npos) {
            return url;
         }
         from = cut + 1;
      }
      return url.substr(0, cut);
   }

   // Scans page text for "http://" links. Each link is cut down to its root
   // document; a root not yet present in `visited` is marked visited,
   // logged, and queued together with a freshly allocated temp file name.
   void process(const string & data) {
      const string scheme("http://");
      string url;

      for(size_t at = data.find(scheme, 0);
          at != string::npos;
          at = data.find(scheme, at)) {
         // Collect the run of URL characters that follows the scheme.
         at += scheme.size();
         size_t stop = at;
         while(is_url_char(data[stop])) {
            ++stop;
         }
         url = get_root_document(scheme + data.substr(at, stop - at));
         at = stop;

         if(visited[url] == false) {
            visited[url] = true;
            cout << "FOUND URL: [" << url << "]" << endl;
            put(url, get_next_temp_file());
         }
      }
   }

   // Returns `data` with every '<' ... '>' span removed. The angle brackets
   // themselves are always dropped; other characters are kept only while no
   // tag is open.
   string strip_tags(const string & data) {
      string kept;
      bool inside_tag = false;
      for(string::const_iterator it = data.begin(); it != data.end(); ++it) {
         switch(*it) {
            case '<':
               inside_tag = true;
               break;
            case '>':
               inside_tag = false;
               break;
            default:
               if(!inside_tag) {
                  kept += *it;
               }
               break;
         }
      }
      return kept;
   }

   // Returns true for characters get_title() keeps: ASCII letters and
   // digits, the space, and the punctuation < / > - . : +
   bool is_title_char(const char ch) {
      if(ch >= 'a' && ch <= 'z') return true;
      if(ch >= 'A' && ch <= 'Z') return true;
      if(ch >= '0' && ch <= '9') return true;

      switch(ch) {
         case ' ': case '<': case '/': case '>':
         case '-': case '.': case ':': case '+':
            return true;
         default:
            return false;
      }
   }

   // Extracts the text between the first <title> tag and the next </title>
   // tag of a page (tag match is case-insensitive). Only characters accepted
   // by is_title_char() are kept, at most MAX_TITLE of them, and any nested
   // tags are removed by strip_tags(). Returns "" when no title is found.
   //
   // Changes from the previous version:
   //  - bytes go through unsigned char before tolower(); a negative char is
   //    undefined behaviour there;
   //  - the closing tag is searched for after the opening tag, so a stray
   //    "</title>" earlier in the page no longer suppresses the title;
   //  - copying starts after "<title>", so the tag itself no longer uses up
   //    part of the MAX_TITLE budget.
   string get_title(const string & the_data) {
      const string open_tag("<title>");
      const string close_tag("</title>");

      // Lower-cased copy used only for locating the tags; the title text is
      // taken from the_data so its original case is preserved.
      string lowered(the_data);
      for(size_t i = 0; i < lowered.size(); i++) {
         lowered[i] = static_cast<char>(
            tolower(static_cast<unsigned char>(lowered[i])));
      }

      const size_t open_at = lowered.find(open_tag);
      if(open_at == string::npos) {
         return string();
      }
      const size_t text_at = open_at + open_tag.size();
      const size_t close_at = lowered.find(close_tag, text_at);
      if(close_at == string::npos) {
         return string();
      }

      string ret;
      for(size_t p = text_at; p != close_at && ret.size() < MAX_TITLE; p++) {
         if(is_title_char(the_data[p])) {
            ret += the_data[p];
         }
      }
      return strip_tags(ret);
   }

   // Handles one downloaded page.
   //   temp.first  - URL that was fetched.
   //   temp.second - temp file holding the response (headers + body).
   // Reads the file; if it looks like a usable response, extracts the
   // "Server: " header value and the page title and queues any links found.
   // Then, for root URLs only (no '/' after the "http://" prefix), appends a
   // "url","title","server" line to URL_FILE. The temp file is always
   // removed at the end.
   //
   // Fix: the Server header scan is now bounded by data.size(). Previously a
   // response ending inside the header, with no CR/LF after it, made the
   // loop read past the end of the string.
   // NOTE(review): the server value is written between quotes without
   // escaping; a header containing '"' would break the line format.
   void process(const URL & temp) {
      cout << "PROCESS: " << temp.first << endl;
      cout << "FILE: " << temp.second << endl;
      string data;
      string title, server;

      // Slurp the temp file; a missing or unreadable file leaves data empty.
      ifstream fin(temp.second.c_str());
      if(fin) {
         char ch;
         while(fin.get(ch)) {
            data += ch;
         }
         fin.close();
      }

      if(data.size()) {
         if(data.find("text/html") != string::npos 
            || data.find("text/plain") != string::npos
            || data.find("302 Found") != string::npos
            || data.find("200 OK") != string::npos) {
            size_t ptr = data.find("Server: ");
            if(ptr != string::npos) {
               ptr += 8;   // skip past "Server: "
               while(ptr < data.size()
                     && data[ptr] != '\n' && data[ptr] != '\r') {
                  server += data[ptr];
                  ptr++;
               }
            }
            title = get_title(data);
            process(data);
         }
      }

      ofstream fout(URL_FILE, ios::ate | ios::app);
      if(fout) { 
         // Only site roots are recorded: nothing after the host part.
         if(temp.first.find("/", 7) == string::npos) {
            fout << "\"" << temp.first << "\",\"" << title << "\",\"" << server << "\"" << endl;
         }
         fout.close();
      }
      unlink(temp.second.c_str());
   }

   int process(list< URL > & temp_list, int sub) {
      int n = 0;
      for(list< URL >::iterator ptr = temp_list.begin(); ptr != temp_list.end(); ptr++) {
         n++;
         process(URL(ptr->first, ptr->second));
         if(n >= sub) 
            break;
      }
      for(int i = 0; i < n; i++) {
         temp_list.pop_front();
      }
      return n;
   }

   // Returns the host part of `url`: the characters from index 7 (just past
   // an "http://" prefix) up to the first ':' or '/', or to the end of the
   // string. URLs of 7 characters or fewer yield "".
   string get_host(const string & url) {
      const size_t scheme_len = 7;
      if(url.size() <= scheme_len) {
         return string();
      }
      const size_t stop = url.find_first_of(":/", scheme_len);
      if(stop == string::npos) {
         return url.substr(scheme_len);
      }
      return url.substr(scheme_len, stop - scheme_len);
   }

   // Returns the request target for `url` (assumed to start with "http://"):
   // everything after the host and optional ":port". The result always
   // starts with '/', and is "/" when the URL has no path.
   //
   // Fixes over the previous version:
   //  - URLs shorter than the 7-character prefix no longer index past the
   //    end of the string;
   //  - a ":port" suffix is skipped instead of being returned as the start
   //    of the path ("http://h:8080/p" used to give "8080/p");
   //  - a target that starts at '?' or '&' gets a leading '/', because an
   //    HTTP origin-form request target must begin with a slash.
   string get_URI(const string & url) {
      const size_t scheme_len = 7;
      string ret;

      if(url.size() > scheme_len) {
         size_t i = url.find_first_of(":/?&", scheme_len);
         if(i != string::npos) {
            if(url[i] == ':') {
               // Step over the colon and the port digits after it.
               ++i;
               while(i < url.size()
                     && isdigit(static_cast<unsigned char>(url[i]))) {
                  ++i;
               }
            }
            ret = url.substr(i);
         }
      }

      if(ret.empty() || ret[0] != '/') {
         ret.insert(0, "/");
      }
      return ret;
   }

   // Sends an HTTP/1.1 GET for `uri` over the already-connected `tcp` and
   // copies the response, one character at a time, into `file`. Copying
   // stops when the stream goes bad or when the MAX_PAGE_SIZE-th character
   // is read (that character is not written). The connection is closed
   // before returning.
   //
   // The output stream is flushed after every character. NOTE(review):
   // presumably so that bytes already received are on disk if the SIGALRM
   // handler ends the process mid-transfer -- confirm before batching.
   void get_page0(tcp_stream & tcp, const string & host, const string & uri, const string & file) {
      signal(SIGALRM, on_SIGALRM);

      // Assemble the request header block.
      string request("GET ");
      request += uri;
      request += " HTTP/1.1\r\n";
      request += "User-Agent: ";
      request += BOT_NAME;
      request += "\r\n";
      request += "Host: ";
      request += host;
      request += "\r\n";
      request += "Accept: text/plain, text/html";
      request += "\r\n";
      request += "Connection: close";
      request += "\r\n\r\n";

      ofstream out(file.c_str(), ios::ate | ios::trunc | ios::out);
      tcp << request;
      if(out) {
         size_t received = 0;
         for(char ch = tcp.get(); tcp; ch = tcp.get()) {
            if(++received >= MAX_PAGE_SIZE) {
               break;
            }
            out << ch << flush;
         }
         out.close();
      }
      tcp.close();
   }

   // Downloads `url` into `file`, giving up after TIMEOUT seconds.
   //   wget == true  (default): runs the external `wget` program in a child
   //                            process and waits for it.
   //   wget == false          : connects to port 80 itself and uses
   //                            get_page0().
   // When the alarm fires, on_SIGALRM ends the calling process, so this is
   // meant to be called from a disposable worker process (as run() does).
   //
   // Fixes over the previous version:
   //  - the SIGALRM handler is installed before alarm() in both branches;
   //    the direct branch used to arm the alarm first, so a slow connect
   //    got the default SIGALRM action instead of the handler;
   //  - if execlp() fails the child now _exit()s instead of returning into
   //    the caller's code as a second copy of the worker;
   //  - a failed fork() is reported instead of calling waitpid(-1, ...);
   //  - the execlp() argument list ends with a typed null pointer, as
   //    required for variadic calls.
   // NOTE(review): when the alarm fires in the wget branch this process
   // exits without killing wget; wget then relies on its own -T3 timeout.
   void get_page(const string & url, const string & file, bool wget = true) {
      signal(SIGALRM, on_SIGALRM);

      if(wget == false) {
         string host = get_host(url);
         string uri = get_URI(url);
         alarm(TIMEOUT);
         tcp_stream tcp(host.c_str(), 80);
         if(tcp) {
            get_page0(tcp, host, uri, file);
         }
         alarm(0);
      }
      else {
         alarm(TIMEOUT);
         const pid_t pid = fork();
         if(pid == 0) {
            execlp("wget", "wget", "-Q10k", "-q", "-U", BOT_NAME, "-t", "1", "--no-check-certificate", "--save-headers",
                   "-A", ".html .htm .asp .aspx .php .jsp .pl .cgi .txt", "-T3", 
                   url.c_str(), "-O", file.c_str(),
                   static_cast<char *>(0));
            // Only reached when exec failed; 127 is the conventional
            // "command not found" status.
            _exit(127);
         }
         else if(pid > 0) {
            int status = 0;
            waitpid(pid, &status, 0);
         }
         else {
            cout << "error: can't fork()..." << endl;
         }
         alarm(0);
      }
   }

   // Returns the next unused temp file path,
   // DEFAULT_TEMP_DIR "/temp_<n>.txt", and advances the counter.
   //
   // Uses %lu because file_count is an unsigned long (it was %ld), and
   // snprintf so the write is bounded by the buffer size.
   string get_next_temp_file() {
      char buffer[1024];
      snprintf(buffer, sizeof(buffer), "%s/temp_%lu.txt", DEFAULT_TEMP_DIR, file_count);
      file_count++;
      return buffer;
   }

   // Main crawl loop. Opens the on-disk queue in LIST_FILE (recreating it
   // first when `init` is set), seeds it with the start URL, then repeats
   // until the queue is empty and no worker is outstanding:
   //   1. fork up to N_FORK workers (never more than N_FORK_MAX
   //      outstanding), each of which downloads one URL into its temp file
   //      and exits;
   //   2. wait for the oldest worker(s) and process() their downloads,
   //      which may add new URLs to the queue.
   // pid_list and temp_list are parallel FIFOs: element k of temp_list is
   // the URL entry being fetched by the worker at element k of pid_list.
   //
   // Fixes over the previous version:
   //  - an entry is appended to temp_list only after fork() succeeded. It
   //    used to be appended first, so a failed fork left temp_list one
   //    entry longer than pid_list and every later result was paired with
   //    the wrong worker;
   //  - a URL whose fork() failed is put back on the queue instead of being
   //    dropped;
   //  - the fork-failure path no longer calls pid_list.front() on an empty
   //    list;
   //  - removed a `break` in the "queue still has work" branch that left
   //    the outer loop, abandoning queued URLs, when no worker remained;
   //  - `queue` is cleared before returning, since the list it pointed at
   //    is a local of this function.
   // NOTE(review): p_db still refers to the local `db` after this function
   // returns, and the worker leaves with exit() rather than _exit(); both
   // are unchanged here -- check whether other code depends on them.
   void run() {
      int status = 0;
      int count = 0;
      int pid;
      URL temp;
      string file;
      pid_list_type pid_list;
      list< URL > temp_list;

      signal(SIGCHLD, on_SIGCHLD);

      if(init) {
         unlink(LIST_FILE);
 
         ofstream(LIST_FILE, ios::ate).close();
      }

      d_database db(LIST_FILE);

      p_db = &db;

      if(init) {
         db.allocate(sizeof(offset_type)
                     + sizeof(offset_type)
                     + sizeof(offset_type)
                     + sizeof(offset_type));
      }

      d_list< URL, URL_size_type > url_list(db, sizeof(offset_type), URL_size);

      queue = &url_list;

      // Seed the queue with the start URL.
      file = get_next_temp_file();

      temp.first = start;
      temp.second = file;

      queue->push_back(temp);

      while(queue->size() || pid_list.size()) {
         // Phase 1: start a batch of workers.
         count = 0;
         while(queue->size() && count < N_FORK && pid_list.size() < N_FORK_MAX) {
            temp = get();

            if((pid = fork()) == 0) {
               // Worker: fetch one page, then leave.
               cout << "GET: " << temp.first << endl;
               get_page(temp.first,temp.second);
               exit(0);
            }
            else if(pid < 0) {
               cout << "error: can't fork()..." << endl;
               // Retry this URL later rather than losing it.
               queue->push_back(temp);
               sleep(FORK_TIMEOUT);
               // Reap finished workers to free up process slots.
               while(!pid_list.empty()
                     && waitpid(pid_list.front(), &status, 0) > 0) {
                  temp = temp_list.front();
                  temp_list.pop_front();
                  process(temp);
                  pid_list.pop_front();
                  if(pid_list.size() < N_FORK_MAX) {
                     break;
                  }
               }
               continue;
            }
            else {
               // Keep the two FIFOs in step.
               pid_list.push_back(pid);
               temp_list.push_back(temp);
            }
            count++;
         }

         // Phase 2: collect results.
         if(pid_list.size()) {
            if(queue->size() == 0) {
               // Nothing left to start: drain workers until one of them
               // yields new URLs or none remain.
               while(queue->size() == 0) {
                  if(waitpid(pid_list.front(), &status, 0) > 0) {
                     temp = temp_list.front();
                     temp_list.pop_front();
                     process(temp);
                     pid_list.pop_front();
                     if(pid_list.size() == 0) {
                        break;
                     }
                  }
               }
            }
            else {
               // More work is queued: collect only the oldest worker, then
               // go back to starting new ones.
               if(waitpid(pid_list.front(), &status, 0) > 0) {
                  temp = temp_list.front();
                  temp_list.pop_front();
                  process(temp);
                  pid_list.pop_front();
               }
            }
         }
      }

      // url_list is about to be destroyed; do not leave queue dangling.
      queue = NULL;
   }

   size_t size() { 
      return queue->size(); 
   }
};

}

#endif /* _TITLE_BOT_H  */
