#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <list>
#include <map>
#include <algorithm>

#include <cstring>

#include <sys/wait.h>
#include <sys/types.h>
#include <signal.h>

#include <fcntl.h>
#include <unistd.h>

#include "tcp_stream.h"

using namespace std;
using namespace net_tools;

#define MAX_PAGE_SIZE (1024 * 1024)
#define MAX_THREADS 15
#define MAX_URL_SIZE 1024

typedef list< string > string_list_type;
typedef list< char > char_list_type;
typedef map< string, bool > bool_map_type;

int robot_count = 0;

/**
 * In-memory model of one site's robots.txt.
 *
 * Holds the Allow and Disallow path patterns grouped by the user agent they
 * were declared for, and answers whether a URI may be fetched by this
 * crawler (user agent "TitleBot", or the "*" wildcard group).
 *
 * The class owns no raw resources, so copying, assignment and destruction
 * are left to the compiler-generated members.
 */
class robot_file {
public:
   typedef std::map< std::string, std::list< std::string > > access_map_type;

   // Not read or written by any member of this class.
   std::string domain_name;
   // user agent -> path patterns explicitly allowed for it
   access_map_type allow_map;
   // user agent -> path patterns disallowed for it
   access_map_type disallow_map;

   /** Record an Allow pattern for the given user agent. */
   void allow(const std::string & user_agent,
              const std::string & uri) {
      allow_map[user_agent].push_back(uri);
   }

   /** Record a Disallow pattern for the given user agent. */
   void disallow(const std::string & user_agent,
                 const std::string & uri) {
      disallow_map[user_agent].push_back(uri);
   }

   /**
    * Test a URI path against one robots.txt path pattern.
    *
    * The pattern is matched as a prefix of value. '*' matches any sequence
    * of characters (including none, and including '/'); a '$' as the last
    * pattern character anchors the match to the end of value.
    *
    * @param value    URI path to test
    * @param pattern  robots.txt path pattern
    * @return true when value matches; always false when either argument is
    *         empty (an empty rule restricts nothing)
    */
   bool match(const std::string & value,
              const std::string & pattern) const {
      if(value.empty() || pattern.empty()) {
         return false;
      }

      size_t pattern_len = pattern.size();
      bool anchored = false;

      if(pattern[pattern_len - 1] == '$') {
         anchored = true;
         --pattern_len;
      }

      const size_t none = std::string::npos;

      size_t i = 0;          // position in value
      size_t j = 0;          // position in pattern
      size_t star_j = none;  // position of the most recent '*' in pattern
      size_t star_i = 0;     // value position that '*' currently extends to

      for(;;) {
         if(j == pattern_len) {
            // Whole pattern consumed: a prefix match is enough unless the
            // pattern was anchored with '$'.
            if(!anchored || i == value.size()) {
               return true;
            }
         }
         else if(pattern[j] == '*') {
            // Start by letting '*' match nothing; widen on later mismatch.
            star_j = j;
            star_i = i;
            ++j;
            continue;
         }
         else if(i < value.size() && value[i] == pattern[j]) {
            ++i;
            ++j;
            continue;
         }

         // Mismatch: let the last '*' swallow one more character, if any.
         if(star_j == none || star_i >= value.size()) {
            return false;
         }

         ++star_i;
         i = star_i;
         j = star_j + 1;
      }
   }

   /**
    * Decide whether this crawler may fetch uri.
    *
    * Allow rules are consulted first and win outright; otherwise a matching
    * Disallow rule forbids the fetch; with no matching rule the fetch is
    * permitted.
    */
   bool allow(const std::string & uri) const {
      if(any_rule_matches(allow_map, uri)) {
         return true;
      }

      return !any_rule_matches(disallow_map, uri);
   }

private:
   /** True for the rule groups this crawler obeys. */
   static bool applies_to_this_robot(const std::string & user_agent) {
      return user_agent == "*" || user_agent == "TitleBot";
   }

   /** True when any pattern of an applicable group in rules matches uri. */
   bool any_rule_matches(const access_map_type & rules,
                         const std::string & uri) const {
      for(access_map_type::const_iterator group = rules.begin();
          group != rules.end(); ++group) {
         if(!applies_to_this_robot(group->first)) {
            continue;
         }

         const std::list< std::string > & patterns = group->second;

         for(std::list< std::string >::const_iterator pat = patterns.begin();
             pat != patterns.end(); ++pat) {
            if(match(uri, *pat)) {
               return true;
            }
         }
      }

      return false;
   }
};

typedef map< string, robot_file* > robot_file_map_type;

int the_count;

class robot {
   ofstream * out;
   string title_string;
   string version_string;
   string file_name;
   string url;
   bool reading;
   int fd[2];
   int lock_fd0;
   int lock_fd1;
   int lock_fd2;

public:
   robot(int the_fd[2],
         int the_lock_fd0,
         int the_lock_fd1,
         int the_lock_fd2,
         const string & url,
         ofstream & the_out) {
      this->url = url;
      out = &the_out;
      reading = false;
      fd[0] = the_fd[0];
      fd[1] = the_fd[1];
      lock_fd0 = the_lock_fd0;
      lock_fd1 = the_lock_fd1;
      lock_fd2 = the_lock_fd2;
   }

   robot(const robot & r)
   {
      url = r.url;
      out = r.out;
      lock_fd0 = r.lock_fd0;
      lock_fd1 = r.lock_fd1;
      lock_fd2 = r.lock_fd2;
   }

   /** Flush the result stream and close the three lock descriptors. */
   ~robot() {
      const int lock_fds[] = { lock_fd0, lock_fd1, lock_fd2 };
      const size_t lock_fd_count = sizeof(lock_fds) / sizeof(lock_fds[0]);

      out->flush();

      for(size_t k = 0; k < lock_fd_count; ++k) {
         close(lock_fds[k]);
      }
   }

   /**
    * Apply a flock() operation to lock_fd; exclusive lock by default.
    * The result of flock() is not inspected.
    */
   void lock(int lock_fd, int mode = LOCK_EX) {
      (void) flock(lock_fd, mode);
   }

   /**
    * Apply a flock() operation to lock_fd; releases the lock by default.
    * The result of flock() is not inspected.
    */
   void unlock(int lock_fd, int mode = LOCK_UN) {
      (void) flock(lock_fd, mode);
   }

   /**
    * Extract the host part of a URL.
    *
    * The first 7 characters are skipped unconditionally (the length of
    * "http://"); the host is everything after that up to the first '/' or
    * ':'.
    *
    * @return the host, or an empty string when the URL has 7 characters or
    *         fewer
    */
   string get_domain_name(const string & url) {
      const size_t host_begin = 7;

      if(url.size() <= host_begin) {
         return string();
      }

      const size_t host_end = url.find_first_of("/:", host_begin);

      if(host_end == string::npos) {
         return url.substr(host_begin);
      }

      return url.substr(host_begin, host_end - host_begin);
   }

   /**
    * Case-insensitive test for the two status phrases this crawler accepts
    * in an HTTP status line.
    */
   static bool is_accepted_status(const string & status_line) {
      return strcasestr(status_line.c_str(), "302 Found") != NULL
             || strcasestr(status_line.c_str(), "200 OK") != NULL;
   }

   /**
    * If line starts with field (compared case-insensitively), store the
    * text after it, trimmed of blanks and carriage returns, in value.
    *
    * @return true when line starts with field; value may then be empty
    */
   static bool robots_field_value(const string & line,
                                  const char * field,
                                  string & value) {
      // strcasestr() finds the first occurrence, so a result equal to the
      // start of the line means "line begins with field".
      if(strcasestr(line.c_str(), field) != line.c_str()) {
         return false;
      }

      const char * blanks = " \t\r";
      const size_t first = line.find_first_not_of(blanks, strlen(field));

      if(first == string::npos) {
         value.clear();
      }
      else {
         const size_t last = line.find_last_not_of(blanks);

         value = line.substr(first, last - first + 1);
      }

      return true;
   }

   /**
    * Load the robots.txt rules for the host of url.
    *
    * The raw HTTP response is cached in data/<host>.robots.txt; it is
    * fetched only when that file cannot be opened. The cached response is
    * then parsed: the HTTP header section is skipped, '#' comments are
    * removed, and Disallow:/Allow: values are recorded under the most
    * recent User-agent: value ("TitleBot" until one is seen).
    *
    * @return a heap-allocated robot_file that the CALLER must delete, or 0
    *         when the host cannot be reached or the response status line is
    *         neither "200 OK" nor "302 Found"
    */
   robot_file * get_robot_file(const string & url) {
      const string the_domain = get_domain_name(url);
      const string file = "data/" + the_domain + ".robots.txt";

      if(!ifstream(file.c_str())) {
         tcp_stream stream(the_domain.c_str(), 80);

         if(stream) {
            string request = "GET /robots.txt HTTP/1.0\r\nUser-agent: TitleBot\r\n\r\n";

            stream << request;

            ofstream fout(file.c_str(), ios::ate);

            if(fout) {
               bool found = false;
               bool status_seen = false;
               string line;

               char ch = stream.get();

               while(stream) {
                  line += ch;

                  if(ch == '\n') {
                     // Only the first line of the response is the status.
                     if(!status_seen) {
                        found = is_accepted_status(line);
                        status_seen = true;
                     }

                     fout << line << flush;

                     line.clear();
                  }

                  ch = stream.get();
               }

               // Keep a final line that was not newline-terminated.
               if(!line.empty()) {
                  if(!status_seen) {
                     found = is_accepted_status(line);
                  }

                  fout << line << flush;
               }

               fout.close();

               if(!found) {
                  return 0;
               }
            }

            stream.close();
         }
         else {
            return 0;
         }
      }

      robot_file * p_robot_file = new robot_file();

      ifstream cached(file.c_str());

      if(cached) {
         string user_agent = "TitleBot";
         string line;
         string value;
         bool first_line = true;
         bool in_headers = false;

         // getline() also yields a last line that has no trailing newline.
         while(getline(cached, line)) {
            if(first_line) {
               first_line = false;
               // The cache holds the whole HTTP response, headers included.
               in_headers = (strcasestr(line.c_str(), "HTTP/") == line.c_str());
            }

            if(in_headers) {
               // A blank line (possibly just "\r") ends the header section.
               if(line.find_first_not_of(" \t\r") == string::npos) {
                  in_headers = false;
               }

               continue;
            }

            const size_t hash = line.find('#');

            if(hash != string::npos) {
               line.erase(hash);
            }

            const size_t start = line.find_first_not_of(" \t");

            if(start == string::npos) {
               continue;
            }

            line.erase(0, start);

            if(robots_field_value(line, "Disallow:", value)) {
               // An empty Disallow restricts nothing.
               if(!value.empty()) {
                  p_robot_file->disallow(user_agent, value);
               }
            }
            else if(robots_field_value(line, "Allow:", value)) {
               if(!value.empty()) {
                  p_robot_file->allow(user_agent, value);
               }
            }
            else if(robots_field_value(line, "User-agent:", value)) {
               user_agent = value;
            }
         }

         cached.close();
      }

      return p_robot_file;
   }

   void write_URL(const string & the_url) {

      if(strcasestr(the_url.c_str(), ".html") == NULL
         && strcasestr(the_url.c_str(), ".htm") == NULL
         && strcasestr(the_url.c_str(), ".jsp") == NULL
         && strcasestr(the_url.c_str(), ".php") == NULL
         && strcasestr(the_url.c_str(), ".php3") == NULL
         && strcasestr(the_url.c_str(), ".php4") == NULL
         && strcasestr(the_url.c_str(), ".php5") == NULL
         && strcasestr(the_url.c_str(), ".asp") == NULL
         && strcasestr(the_url.c_str(), ".aspx") == NULL
         && strcasestr(the_url.c_str(), ".cgi") == NULL
         && strcasestr(the_url.c_str(), ".exe") == NULL
         && strcasestr(the_url.c_str(), ".dll") == NULL
         && strcasestr(the_url.c_str(), ".cfm") == NULL
         && strcasestr(the_url.c_str(), ".pl") == NULL
         && strcasestr(the_url.c_str(), ".py") == NULL
         && strcasestr(the_url.c_str(), "?") == NULL
         && get_URI(the_url) != "/"
         && get_URI(the_url) != "") {
         return;
      }

      lock(lock_fd0);

      bool found = false;

      ifstream fin("url_list.txt");

      if(fin) {
         string url;

         fin >> url;

         while(fin) {
            if(url == the_url) {
               found = true;

               break;
            }

            fin >> url;
         }

         fin.close();
      }

      if(!found) {
         ofstream fout("url_list.txt", ios::app);

         if(fout) {
            fout << the_url << endl << flush;

            fout.close();
         }

         robot_file * p_robot = get_robot_file(the_url);

         string temp = the_url + "\n";

         if(p_robot != 0) {
            if(p_robot->allow(get_URI(the_url))) {
               write(fd[1], temp.c_str(), temp.size());
            }
         }
         else {
            write(fd[1], temp.c_str(), temp.size());
         }

         unlock(lock_fd0);
      }
      else {
         unlock(lock_fd0);
      }
   }

   /**
    * Read one line from fd, one byte at a time.
    *
    * Reading stops at a newline (consumed, not returned), when read() does
    * not deliver a byte, or once IO_BUFFER_SIZE bytes have been collected.
    * Bytes are appended straight to the result, so a maximum-length line
    * can no longer overrun an unterminated stack buffer.
    *
    * @return the bytes read before the terminating condition; empty when
    *         nothing could be read or the line itself was empty
    */
   string read_line(int fd) {
      const size_t max_line = static_cast< size_t >(IO_BUFFER_SIZE);

      string ret;
      char ch;

      while(ret.size() < max_line && read(fd, &ch, 1) == 1) {
         if(ch == '\n') {
            break;
         }

         ret += ch;
      }

      return ret;
   }

   /**
    * Fetch the next queued URL from the pipe, holding lock_fd1 while the
    * line is read.
    *
    * @param url  receives the line that was read
    * @return false when the line is empty, true otherwise
    */
   bool get_next_URL(string & url) {
      lock(lock_fd1);
      url = read_line(fd[0]);
      unlock(lock_fd1);

      return !url.empty();
   }

   /**
    * Extract the path part of a URL.
    *
    * The search for the first '/' starts at offset 7 (just past "http://");
    * everything from that slash to the end of the URL is returned.
    *
    * @return the path, or "/" when no slash is found
    */
   string get_URI(const string & url) {
      const size_t path_begin = url.find('/', 7);

      if(path_begin == string::npos) {
         return "/";
      }

      return url.substr(path_begin);
   }

   /**
    * Tell whether ch is one of the characters this crawler accepts inside a
    * URL: a letter, a digit, or one of ? = / . % + &
    */
   bool is_url_char(const char ch) {
      // The <cctype> classifiers require a value representable as unsigned
      // char; passing a negative plain char is undefined behaviour.
      const unsigned char uch = static_cast< unsigned char >(ch);

      if(isdigit(uch) || isalpha(uch)) {
         return true;
      }

      switch(ch) {
      case '\?':
      case '=':
      case '/':
      case '.':
      case '%':
      case '+':
      case '&':
         return true;

      default:
         return false;
      }
   }
  

   /**
    * Scan an HTTP response for href= links and queue them via write_URL.
    *
    * Nothing is done unless the response contains
    * "Content-type: text/html" (case-insensitive). For each href:
    *  - links starting with mailto:, https://, telnet:// or ftp:// are
    *    skipped;
    *  - links starting with http:// queue that site's root and, when the
    *    link has a path other than "/", the link itself;
    *  - anything else is treated as relative to the root of url's site.
    * A link ends at a quote, space, '>', '#' or the end of the buffer.
    *
    * @param url     URL the page was fetched from
    * @param buffer  NUL-terminated response text; may be 0
    * @param size    length of buffer (not used by the scan)
    */
   void process_page_data(const string & url, 
                          const char * buffer, 
                          const size_t size) {
      if(buffer == 0) {
         return;
      }

      if(strcasestr(buffer, "Content-type: text/html") == NULL) {
         return;
      }

      static const char * const skipped_schemes[] = {
         "mailto:", "https://", "telnet://", "ftp://"
      };
      const size_t skipped_count = sizeof(skipped_schemes) / sizeof(skipped_schemes[0]);

      const string site = "http://" + get_domain_name(url);
      const string site_root = site + "/";

      const char * ptr = strcasestr(buffer, "href=");

      while(ptr != NULL) {
         ptr += 5; 

         // Step over an opening quote of either kind.
         if(*ptr == '\"' || *ptr == '\'') {
            ptr++;
         }

         string str;

         // Stop at the terminator too, so a truncated page cannot send the
         // scan past the end of the buffer.
         for(const char * p = ptr;
             (*p) != '\0' && (*p) != '\"' && (*p) != '\''
             && (*p) != ' ' && (*p) != '>' && (*p) != '#';
             p++) {
            str += *p;
         }

         bool skipped = false;

         for(size_t k = 0; !skipped && k < skipped_count; ++k) {
            // Equal to the start of the string means "is a prefix".
            skipped = (strcasestr(str.c_str(), skipped_schemes[k]) == str.c_str());
         }

         if(skipped) {
            // unsupported scheme: nothing to queue
         }
         else if(strcasestr(str.c_str(), "http://") == str.c_str()) {
            write_URL("http://" + get_domain_name(str) + "/");
            if(get_URI(str) != "/") {
               write_URL(str);
            }
         }
         else {
            write_URL(site_root);
            if(!str.empty()) {
               if(str[0] == '/') {
                  write_URL(site + str);
               }
               else {
                  write_URL(site_root + str);
               }
            }
         }

         ptr = strcasestr(ptr, "href=");
      }
   }

   /**
    * Return the text between the first "<title>" and the first "</title>"
    * that follows it (both matched case-insensitively), with every carriage
    * return and newline replaced by a space.
    *
    * The closing tag is searched for only after the opening tag, so a stray
    * "</title>" earlier in the buffer cannot make the copy run off the end.
    *
    * @param buffer  NUL-terminated text; may be NULL
    * @return the title, or an empty string when no complete title exists
    */
   string get_title(const char * buffer) {
      string ret;

      if(buffer == NULL) {
         return ret;
      }

      const char * p0 = strcasestr(buffer, "<title>");

      if(p0 == NULL) {
         return ret;
      }

      p0 += 7;

      const char * p1 = strcasestr(p0, "</title>");

      if(p1 == NULL) {
         return ret;
      }

      for(; p0 != p1; p0++) {
         if(*p0 == '\r' || *p0 == '\n')
            ret += ' ';
         else
            ret += (*p0);
      }

      return ret;
   }

   /**
    * Return the value that follows the first "Server: " (case-insensitive)
    * in buffer, up to the end of that line or the end of the buffer.
    *
    * @param buffer  NUL-terminated text; may be NULL
    * @return the server string, or an empty string when none is present
    */
   string get_version(const char * buffer) {
      string ret;

      if(buffer == NULL) {
         return ret;
      }

      const char * p = strcasestr(buffer, "Server: ");

      if(p) {
          p += 8;
          // Also stop at the terminator: the header may be the last thing
          // in the buffer with no line ending after it.
          while(*p != '\0' && *p != '\r' && *p != '\n') {
             ret += *p;
             p++;
          }
      }

      return ret;
   }

   /**
    * Tell whether ch may be kept by clean_string: ASCII letters and digits,
    * space, tab, and the punctuation listed below. Everything else
    * (including '!', control characters and non-ASCII bytes) is rejected.
    */
   bool is_valid(char ch) {
      static const char accepted_punctuation[] =
         "`~@#$%^&*()-_=+{}[]|\\\"':;<>,./?";

      const bool is_letter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
      const bool is_number = (ch >= '0' && ch <= '9');
      const bool is_blank = (ch == ' ' || ch == '\t');

      if(is_letter || is_number || is_blank) {
         return true;
      }

      // strchr() would "find" the terminator itself, so exclude NUL first.
      return ch != '\0' && strchr(accepted_punctuation, ch) != NULL;
   }

   /**
    * Return a copy of str containing only the characters accepted by
    * is_valid, in their original order.
    */
   string clean_string(const string & str) {
      string kept;

      kept.reserve(str.size());

      for(string::const_iterator it = str.begin(); it != str.end(); ++it) {
         if(!is_valid(*it)) {
            continue;
         }

         kept.push_back(*it);
      }

      return kept;
   }

   void process_page(const string & url) {
      //cout << "PROCESS_URL: " << url << endl << flush;
      string domain_name = get_domain_name(url);
      string uri = get_URI(url);
      if(uri.size() == 0) {
         uri = "/";
      }

//cout << "domain_name: \"" << domain_name << "\"" << endl << flush;
//cout << "URI: \"" << uri << "\"" << endl << flush;

      string request = "GET " + uri + " HTTP/1.0\r\nUser-agent: TitleBot\r\n\r\n";

//cout << request << endl << flush;

      if(domain_name.size() == 0) {
         return;
      }

      tcp_stream stream(domain_name.c_str(), 80);

      if(stream) {
         //cout << request << flush;

         stream << request;

         string page;

         char ch = stream.get();

         while(stream) {
            if(page.size() > (1024 * 1024)) {
               break;
            }

            page += ch;
 
            ch = stream.get();
         }

         stream.close();

         size_t size = page.size();

         char * buffer = new char[size+1];
      
         memset(buffer, 0, size+1);

         copy(page.begin(), page.end(), buffer);

         page.clear();

         if(strcasestr(buffer, "200 OK") != NULL 
            || strcasestr(buffer, "302 Found") != NULL) {
            title_string = get_title(buffer);
            version_string = get_version(buffer);

            process_page_data(url, buffer, size);

            uri = get_URI(url);
            if(uri == "" || uri == "/" || uri == "/index.html" || uri == "/index.htm" 
               || uri == "/default.asp" || uri == "/index.php" || uri == "/index.jsp" || uri == "/index.asp") {
               lock(lock_fd2);
               (*out) << "\"" << url << "\",\"" << clean_string(title_string) << "\",\""
                      << clean_string(version_string) << "\"" << endl << flush;
               unlock(lock_fd2);
            }
         }

         delete [] buffer;
      }
   }

   /**
    * Worker loop: keep taking URLs from the queue, echo each one to
    * standard output and process it, until get_next_URL reports false.
    */
   void run() {
      for(string next_url; get_next_URL(next_url); ) {
         cout << next_url << endl << flush;

         process_page(next_url);
      }
   }
};

// SIGCHLD handler installed in each forked worker: terminate the process.
// _exit() is used because exit() is not async-signal-safe (it runs atexit
// handlers and flushes stdio, which may be mid-update when the signal
// arrives).
void
on_SIGCHLD(int sig)
{
   (void) sig;

   _exit(0);
}


typedef vector< robot* > robot_vector_type;

// Crawl driver.
//
// Seeds ./url_list.txt and the work pipe with url, truncates ./url.txt for
// the results, then forks workers in batches of MAX_THREADS, waiting for a
// whole batch to finish before starting the next one.
//
// Each worker opens the three lock files itself, after fork(): flock()
// locks belong to the open file description, so descriptors inherited from
// a common parent would all share one lock and exclude nobody.
//
// As in the original design, the spawn loop has no normal termination
// condition; it ends only when fork() fails with no worker running, or
// when the process is signalled.
void run_robot(const string & url) {
   ofstream fout("./url_list.txt", ios::trunc | ios::ate);

   if(fout) {
      fout << url << endl << flush;

      fout.close();
   }

   ofstream out("./url.txt", ios::trunc | ios::ate);

   if(!out) {
      return;
   }

   int fd[2] = { 0, 0 };

   // Check the pipe before touching its descriptors.
   if(pipe(fd) < 0) {
      cerr << "unable to create pipe" << endl << flush;
      exit(1);
   }

   const int flags = fcntl(fd[1], F_GETFL, 0);

   if(flags >= 0) {
      fcntl(fd[1], F_SETFL, flags | O_NONBLOCK);
   }

   const string t = url + "\n";

   if(write(fd[1], t.c_str(), t.size()) < 0) {
      cerr << "unable to queue start URL" << endl << flush;
   }

   the_count = 0;

   typedef vector< int > int_vector_type;
   int_vector_type int_vec;

   for(;;) {
      const int pid = fork();

      if(pid == 0) {
         signal(SIGCHLD, on_SIGCHLD);

         // O_CREAT so a fresh working directory works; the mode argument
         // is only honoured when the file is created.
         const int lock_fd0 = open("./lock0.file", O_RDWR | O_CREAT, 0666);
         const int lock_fd1 = open("./lock1.file", O_RDWR | O_CREAT, 0666);
         const int lock_fd2 = open("./lock2.file", O_RDWR | O_CREAT, 0666);

         if(lock_fd0 < 0 || lock_fd1 < 0 || lock_fd2 < 0) {
            cerr << "unable to open lock files" << endl << flush;
         }

         {
            // Scoped so the robot is destroyed before exit(), which does
            // not unwind automatic objects.
            robot bot(fd, lock_fd0, lock_fd1, lock_fd2, url, out);

            bot.run();
         }

         exit(0);
      }

      if(pid < 0) {
         cerr << "unable to fork" << endl << flush;

         if(int_vec.empty()) {
            // No worker is running and none can be started: give up.
            break;
         }
      }
      else {
         int_vec.push_back(pid);

         the_count++;
      }

      // Batch full, or fork failed while workers are still running: wait
      // for the current batch before trying again.
      if(pid < 0 || the_count >= MAX_THREADS) {
         int status = 0;

         for(size_t i = 0; i < int_vec.size(); i++) {
            if(waitpid(int_vec[i], &status, 0) > 0) {
               the_count--;
            }
         }

         int_vec.clear();
      }
   }

   out.flush();

   out.close();
}

// Handler installed by main(): SIGHUP, SIGINT and SIGTERM terminate the
// process with status 0; SIGCHLD (and anything else) is ignored.
// _exit() is used because exit() is not async-signal-safe.
void
signal_handler(int arg)
{
   switch(arg) {
   case SIGHUP:
   case SIGINT:
   case SIGTERM:
      _exit(0);
   break;

   case SIGCHLD:
   break;
   }
}

// Entry point: install the signal handlers, require exactly one argument
// (the start URL) and hand it to run_robot().
int
main(int argc,
     char ** argv,
     char ** envp)
{
   static const int handled_signals[] = { SIGHUP, SIGINT, SIGTERM, SIGCHLD };
   const size_t handled_count = sizeof(handled_signals) / sizeof(handled_signals[0]);

   for(size_t k = 0; k < handled_count; ++k) {
      signal(handled_signals[k], signal_handler);
   }

   if(argc != 2) {
      cerr << "usage: " << argv[0] << " <URL>" << endl;
      exit(1);
   }

   const string start_url(argv[1]);

   run_robot(start_url);

   return 0;
}

