#include "bot.h"

/*

Copyright (C) 2011.  All rights reserved.
This software is the intellectual property of Matthew William Coan.


Author: Matthew William Coan
Date: Sat May  5 16:47:50 EDT 2012

*/

#include <iostream>
#include <fstream>
#include <map>
#include <list>
#include <string>

#include <cstring>
#include <cstdlib>
#include <ctime>

#include <unistd.h>
#include <signal.h>

#include <sys/types.h>
#include <sys/wait.h>

#include "tcp_stream.h"

// Value sent in the "User-Agent:" header of every HTTP request the robot makes.
#define USER_AGENT "TitleBot: V1.0.0"
// Path of the output file WebRobot opens in its constructor and appends
// result records to.
#define SERVER_FILE "servers.txt"

using namespace std;
using namespace net_tools;

// Global counter shared between the crawler and the SIGCHLD handler:
// WebRobot::download() increments it before each connection attempt and
// on_SIGCHLD() decrements it.
// NOTE(review): it is a plain int modified from a signal handler; consider
// volatile sig_atomic_t if no other translation unit declares it as int.
int fork_count = 0;


// Reports whether the host named by `ip` answers a single ping.
//
// `ip` is a host name or address that ends up on a shell command line, and
// in this program it originates from links found in downloaded pages, i.e.
// it is untrusted. The string is therefore checked against a strict
// whitelist before any command is built: only ASCII letters, digits and
// '.', '-', '_', ':' are accepted, and the first character may not be '-'
// (so the value can never be parsed by ping as an option).
//
// Returns false for NULL, empty or rejected input without running anything;
// otherwise returns true iff the ping command exits with status 0.
bool is_valid(const char * ip) {
   if(ip == NULL || *ip == '\0' || *ip == '-') {
      return false;
   }

   for(const char * p = ip; *p != '\0'; p++) {
      const char c = *p;
      const bool allowed =
         (c >= 'a' && c <= 'z') ||
         (c >= 'A' && c <= 'Z') ||
         (c >= '0' && c <= '9') ||
         c == '.' || c == '-' || c == '_' || c == ':';

      if(!allowed) {
         // Anything else (spaces, quotes, ';', '|', '$', '`', ...) could
         // change the meaning of the shell command below.
         return false;
      }
   }

   // NOTE(review): "-o" is a BSD/macOS ping option; confirm the flags on
   // the deployment platform.
   const std::string command =
      std::string("ping -W 10 -o -c 1 ") + ip + " > /dev/null";

   return std::system(command.c_str()) == 0;
}

// SIGALRM handler: reports that the alarm fired by writing "SIGALRM..." and
// a newline to standard output.
//
// The message is emitted with write(2) rather than iostreams because a
// signal handler may only call async-signal-safe functions; std::cout is
// not one of them and could be interrupted mid-operation by this handler.
void 
on_SIGALRM(int arg) 
{
   (void)arg;

   static const char message[] = "SIGALRM...\n";

   // Best effort: there is nothing useful to do here if the write fails.
   const ssize_t written = write(STDOUT_FILENO, message, sizeof(message) - 1);
   (void)written;
}

// SIGCHLD handler: writes "SIGCHLD..." and a newline to standard output and
// decrements the global fork_count.
//
// The message is emitted with write(2) rather than iostreams because a
// signal handler may only call async-signal-safe functions; std::cout is
// not one of them.
void
on_SIGCHLD(int arg)
{
   (void)arg;

   static const char message[] = "SIGCHLD...\n";

   // Best effort: there is nothing useful to do here if the write fails.
   const ssize_t written = write(STDOUT_FILENO, message, sizeof(message) - 1);
   (void)written;

   fork_count--;
}

// SIGINT handler: terminates the process immediately with status 0.
//
// _exit() is used instead of exit() because exit() runs atexit handlers and
// static destructors, which is not async-signal-safe and can deadlock or
// corrupt state when the signal interrupted e.g. an allocation.
void
on_SIGINT(int arg)
{
   (void)arg;
   _exit(0);
}

// SIGTERM handler: terminates the process immediately with status 0.
//
// _exit() is used instead of exit() because exit() runs atexit handlers and
// static destructors, which is not async-signal-safe and can deadlock or
// corrupt state when the signal interrupted e.g. an allocation.
void
on_SIGTERM(int arg)
{
   (void)arg;
   _exit(0);
}

// SIGHUP handler: terminates the process immediately with status 0.
//
// _exit() is used instead of exit() because exit() runs atexit handlers and
// static destructors, which is not async-signal-safe and can deadlock or
// corrupt state when the signal interrupted e.g. an allocation.
void
on_SIGHUP(int arg)
{
   (void)arg;
   _exit(0);
}

class WebRobot {
public:
   enum { MAX_FORK = 10 };
   typedef list< string > string_queue_type;
   typedef map< string, bool > url_map_type;
   typedef map< string, string > string_map_type;

private:
   string domain;
   string title;
   string user_agent;
   string_queue_type url_queue;
   string_map_type ua_map;
   url_map_type url_map;
   url_map_type my_map;
   ofstream out;

public:
   WebRobot(const string & url) {
      url_queue.push_back(url);
      url_map[url] = true;
      out.open(SERVER_FILE, ios::out | ios::ate | ios::trunc);
   }

   ~WebRobot() {
      out.close();
   }

   string get_title(const string & html) {
      string ret;
      //char ch;

      char * ptr = strcasestr(html.c_str(), "Server:");

      user_agent = "";

      if(ptr != NULL) {
         ptr += 8;
         while(*ptr != '\r' && *ptr != '\n' && *ptr) {
            user_agent += *ptr;
            ptr++;
         } 
      } 

cout << domain << " --> " << user_agent << endl << flush;

      ua_map[domain] = user_agent;

      ptr = strcasestr(html.c_str(), "<title>");

      if(ptr != NULL) {
         ptr += 7;
      } 
      else {
         return string("");
      }

      if(ptr) {
         while(*ptr && *ptr != '<') { 
            ret += *ptr;

            ptr++;
         }
      }

      return ret;
   }

   void find_URL(const string & html) {
      int count;
      char * ptr;
      ptr = strcasestr(html.c_str(), "href=");
      while(ptr != 0) {
         ptr += 5;
         if(*ptr == '\"' || *ptr == '\'') 
            ptr++;
         string temp;
         while(*ptr && *ptr != '<' && *ptr != '>' && *ptr != '\"' && *ptr != '\'') {
            temp += *ptr;
            ptr++;
         }
         if(url_map.find(temp) == url_map.end()) {
            if(temp[0] == '/')
               temp = "http://" + domain + temp; 

            count = 0;

            for(size_t i = 0; i < temp.size(); i++) {
               if(temp[i] == '/') {
                  count++;
               }
            }
 
            if(count <= 5) {
               url_queue.push_back(temp);
            }
            url_map[temp] = true;
         }
         ptr = strcasestr(ptr, "href=");
      }
   }

   string filter_nl(const string & str) {
      string ret;

      for(size_t i = 0; i < str.size(); i++) {
         if(str[i] != '\n' && str[i] != '\r') {
            ret += str[i];
         }
         else {
            ret += " ";
         }
      }

      return ret;
   }

   bool download() {
      //enum { IP_SIZE = 256 };
      //char ip[IP_SIZE];
      bool ret = false;

      //sprintf(ip, "%d.%d.%d.%d", a, b, c, d);
      //strcpy(ip, "74.125.228.8");

      string temp_ip = url_queue.front();
      url_queue.pop_front();
      const char * ip = temp_ip.c_str();
      string addr;
cout << temp_ip << endl << flush;
      size_t i;
      for(i = 7; i < temp_ip.size(); i++) {
         if(temp_ip[i] == '/') break;
         addr += temp_ip[i];
      }
      string uri;
      while(i < temp_ip.size()) {
         uri += temp_ip[i];
         i++;
      }
      if(uri == "") uri = "/";
      domain = addr;
      ip = domain.c_str();

//cout << ip << endl << flush;

      if(!is_valid(ip)) {
         return false;
      }

      fork_count++;

      //int pid = 0;

      //if((pid = fork()) == 0) {
         alarm(3);

         tcp_stream sock(ip, 80);

         alarm(0);

         if(sock) {
//cout << "connected: \"" << ip << "\"" << endl << flush;

            sock << "GET " << uri << " HTTP/1.0\r\n"
                 << "User-Agent: " << USER_AGENT << "\r\n" 
                 << "\r\n";

            char ch;
            string web_page;
            string title;

            ch = sock.get();

            while(sock) {
               web_page += ch;

               ch = sock.get();
            }

            sock.close();

            if(web_page.size()) {
               //cout << "--BEGIN--" << endl << flush;
               //cout << web_page << endl << flush;
               //cout << "--END--" << endl << flush;
               title = get_title(web_page); 
               find_URL(web_page);
               string url = string("http://") + domain + string("/");
               if(my_map.find(url) == my_map.end()) {
                  my_map[url] = true;
                  ret = true;
                  web_page = "";

                  //out.open("servers.txt", ios::out | ios::ate | ios::app);

                  if(out) {
                     out << "\"" << url << "\",\"" << filter_nl(title) << "\",\"" << ua_map[domain] << "\"" << endl << flush;
                     cout << "\"" << url << "\",\"" << filter_nl(title) << "\",\"" << ua_map[domain] << "\"" << endl << flush;

                     //out.close();
                  }
               }
            }
         }
         //exit(0);
      //}
 
      if(fork_count >= MAX_FORK) {
         //waitpid(pid,0,0);
      }

      return ret;
   }

   void run() {
      signal(SIGALRM, on_SIGALRM);
      signal(SIGCHLD, on_SIGCHLD);
      signal(SIGINT, on_SIGINT);
      signal(SIGTERM, on_SIGTERM);
      signal(SIGHUP, on_SIGHUP);

      while(url_queue.size()) {
         if(download()) {
            cout << "got page..." << endl << flush;
         }
      }
   }
};

// Program entry point. Expects exactly one command-line argument, which is
// handed to WebRobot as the starting point of the crawl; prints a usage line
// to standard error and exits with status 1 otherwise.
int
main(int argc, char ** argv, char ** envp)
{
   const bool have_start_argument = (argc == 2);

   if(!have_start_argument) {
      std::cerr << "usage: " << argv[0] << " <start-domain-name>" << std::endl;
      return 1;
   }

   WebRobot bot(argv[1]);
   bot.run();

   return 0;
}


