#include <algorithm>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <string>
#include <vector>

#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>

using namespace std;

// FIFO work queue of URLs: entries are appended with push_back() and
// consumed from the front.
typedef list< string > string_list_type;
// A sequence of raw characters.
typedef list< char > char_list_type;
// URL -> flag. A URL is entered with `false` when it is first discovered and
// flipped to `true` once its CSV row has been written.
typedef map< string, bool > url_map_type;

//#define WGET_CMD "/usr/pkg/bin/wget"
// Path of the fetch command that run_robot() executes through system().
#define WGET_CMD "/home/mcoan/www-robot/wget"
// NOTE(review): not referenced anywhere in the visible part of this file.
#define OUTPUT_FILE "/home/mcoan/www-robot/temp.html"
// Despite the name, this is the prefix of the HTTP "Server:" response header;
// process_webpage() copies the text that follows it into the CSV row.
#define USER_AGENT "Server: "
// Iteration limit of the process-creation loop in run_robot().
#define MAX_FORK 10

// Maximum number of characters of a fetched page that process_webpage() reads.
#define MAX_PAGE_SIZE (1024 * 100)
// NOTE(review): not referenced anywhere in the visible part of this file;
// presumably the same limit in a form suitable for a command-line option.
#define MAX_PAGE_SIZE_STR "100k"

// NOTE(review): not referenced anywhere in the visible part of this file.
int fork_count = 0;

// Signal handler for SIGCHLD, SIGINT, SIGTERM and SIGHUP.
//
// SIGCHLD is accepted and deliberately ignored. SIGINT, SIGTERM and SIGHUP
// terminate the process with status EXIT_SUCCESS. Any other signal number is
// ignored.
//
// the_signal: number of the signal being delivered.
void
on_SIGNAL(int the_signal)
{
   switch(the_signal) {
      case SIGCHLD:
         break;

      case SIGINT:
      case SIGTERM:
      case SIGHUP:
         // exit() is not async-signal-safe: it runs atexit handlers and
         // static destructors and flushes stdio/iostream buffers, any of
         // which may have been interrupted half-way by this very signal.
         // _exit() is on the POSIX async-signal-safe list. Every write in
         // this program is followed by an explicit flush, so no buffered
         // output is lost by skipping the exit-time flush.
         _exit(EXIT_SUCCESS);
         break;

      default:
         break;
   }
}

// Extract one URL starting at `url` and queue it if it is new.
//
// Characters are consumed until a NUL, a quote (" or '), an angle bracket,
// a space, a newline, a carriage return or a tab is reached. On return `url`
// points at that terminating character.
//
// The extracted text is appended to url_list and entered into url_map with
// the value `false`, unless it is already a key of url_map or it contains
// one of ".jpeg", ".jpg", ".gif", ".png" anywhere in the text.
//
// url:      in/out cursor into a NUL-terminated buffer.
// url_list: queue that receives newly discovered URLs.
// url_map:  set of URLs seen so far.
// fout:     not used by this function.
void
get_url_value(char * & url, string_list_type & url_list, url_map_type & url_map, ofstream & fout)
{
   string candidate;
   bool at_delimiter = false;

   while(!at_delimiter) {
      switch(*url) {
         case '\0':
         case '\"':
         case '\'':
         case '<':
         case '>':
         case ' ':
         case '\n':
         case '\r':
         case '\t':
            at_delimiter = true;
            break;

         default:
            candidate.push_back(*url);
            ++url;
            break;
      }
   }

   // Already known: nothing to do.
   if(url_map.count(candidate) != 0)
      return;

   // Skip anything that looks like an image link.
   static const char * const image_markers[] = { ".jpeg", ".jpg", ".gif", ".png" };
   const size_t marker_count = sizeof(image_markers) / sizeof(image_markers[0]);

   for(size_t k = 0; k < marker_count; ++k) {
      if(candidate.find(image_markers[k]) != string::npos)
         return;
   }

   url_map.insert(url_map_type::value_type(candidate, false));
   url_list.push_back(candidate);
}

// Scan a NUL-terminated buffer for every occurrence of "http://" and hand
// each one to get_url_value(), which extracts the URL, queues it when it is
// new, and leaves the cursor just past the extracted text so the search can
// resume from there.
//
// buffer:   NUL-terminated page text to scan.
// url_list: queue that receives newly discovered URLs.
// url_map:  set of URLs seen so far.
// fout:     forwarded unchanged to get_url_value().
void
get_URL(char * buffer, string_list_type & url_list, url_map_type & url_map, ofstream & fout)
{
   static const char scheme[] = "http://";

   for(char * cursor = strstr(buffer, scheme);
       cursor != NULL;
       cursor = strstr(cursor, scheme)) {
      get_url_value(cursor, url_list, url_map, fout);
   }
}

// Reduce a fragment of HTML to plain, single-spaced text.
//
// Everything from a '<' up to and including the next '>' is dropped (a stray
// '>' outside a tag is dropped as well). In what remains, every run of
// spaces, tabs, carriage returns and newlines collapses to a single space,
// and leading and trailing whitespace is removed.
//
// The work is done in one pass over the string. This avoids a manually
// managed scratch buffer (which would leak if an append threw) and strtok(),
// which overwrites process-wide tokenizer state. Unlike a C-string based
// implementation, an embedded NUL character does not truncate the input.
//
// html:    text to clean; may be empty.
// returns: the cleaned text; empty when nothing but tags/whitespace remains.
std::string
strip_HTML(const std::string & html)
{
   std::string ret;
   bool in_tag = false;
   bool pending_space = false;   // whitespace seen since the last kept char

   for(std::string::size_type i = 0; i < html.size(); i++) {
      const char ch = html[i];

      if(ch == '<') {
         in_tag = true;
         continue;
      }

      if(ch == '>') {
         in_tag = false;
         continue;
      }

      if(in_tag)
         continue;

      if(ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
         // Leading whitespace is never emitted; trailing whitespace stays
         // pending forever and is therefore dropped as well.
         if(!ret.empty())
            pending_space = true;

         continue;
      }

      if(pending_space) {
         ret += ' ';
         pending_space = false;
      }

      ret += ch;
   }

   return ret;
}

void
process_webpage(const char * file_name, 
                string_list_type & url_list,
                url_map_type & url_map,
                ofstream & fout,
                const string & url)
{
   ifstream fin(file_name, ios::in);

   if(fin) {
      char ch;

      char_list_type ch_list;

      ch = fin.get();

      while(fin) {
         ch_list.push_back(ch);

         if(ch_list.size() >= MAX_PAGE_SIZE) {
            break;
         }

         ch = fin.get();
      }

      fin.close();

      size_t size = ch_list.size();
      char * buffer = new char[size+1];
      memset(buffer, 0, size+1);

      size_t i = 0;

      for(char_list_type::iterator ptr = ch_list.begin(); ptr != ch_list.end(); ptr++) {
         buffer[i] = (*ptr);
         i++;
      }

      if(strstr(buffer, "200 OK") != NULL && strstr(buffer, "text/html") != NULL) {
         get_URL(buffer, url_list, url_map, fout);

         string title, user_agent;

         char * ptr = strstr(buffer, USER_AGENT);

         if(ptr) {
            ptr += strlen(USER_AGENT);

            while(*ptr != '\r' && *ptr != '\n') {
               user_agent += *ptr;

               ptr++;
            }
         }

         ptr = strcasestr(buffer, "<title>");

         if(ptr) {
            ptr += 7;

            char * ptr0 = strcasestr(buffer, "</title>");

            if(ptr0) {
               title = "";

               while(ptr != ptr0) {
                  title += *ptr;

                  ptr++;
               }
            }
         }

         title = strip_HTML(title);
  
         if(url_map[url] == false) {
            fout << "\"" << url << "\",\"" << title << "\",\"" << user_agent << "\"" << endl << flush;

            url_map[url] = true;
         }
      }

      delete [] buffer;
   }
}

// Format a long as its decimal text representation.
//
// l:       value to format.
// returns: the digits of l, with a leading '-' for negative values; an empty
//          string if formatting fails.
std::string
to_string(const long & l)
{
   // 32 characters hold any 64-bit value (19 digits, sign, NUL) with room to
   // spare; snprintf() bounds the write regardless of the width of long.
   char buffer[32];

   const int written = snprintf(buffer, sizeof(buffer), "%ld", l);

   if(written < 0)
      return std::string();

   return std::string(buffer);
}

int
run_robot(const char * start_url)
{
   int ret = 0;
   long count = 0L;
   string cmd, url, temp_file;
   string_list_type url_list;
   url_map_type url_map;
   
   url_list.push_back(string(start_url));

   ofstream fout("www-titles.csv", ios::out | ios::trunc);
  
   for(int i = 0; i < MAX_FORK; i++)
      if(vfork() == 0)
         break;
   
   if(fout) {
      while(url_list.size()) {
         url = url_list.front();
         url_list.pop_front();

         if(url_map[url] == true)
            continue;

         temp_file = "temp/temp-page" + to_string(count) + ".html";

         count++;
      
         cmd = string(WGET_CMD) + string(" \"") + url + string("\" > ") + temp_file;

         cout << "CMD: " << cmd << endl << flush;

         if(system(cmd.c_str()) == 0) {
            process_webpage(temp_file.c_str(), url_list, url_map, fout, url);

            unlink(temp_file.c_str());
         }
      }
 
      fout.close();
   }

   return ret;
}

int
main(int argc,
     char ** argv,
     char ** envp)
{
   signal(SIGTERM, on_SIGNAL);
   signal(SIGINT, on_SIGNAL);
   signal(SIGHUP, on_SIGNAL);
   signal(SIGCHLD, on_SIGNAL);

   cout << "start robot..." << endl << flush;

   int rc = run_robot(argv[1]);

   cout << "end robot..." << endl << flush;

   return rc;
}
