/*
    Copyright (C) 2009  Matthew William Coan

    TitleBot HTTP web robot.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.


TitleBot -- WWW robot that creates a list of domain names of 
            all the web servers and the title of the root document for
            each one.

This domain name and title data could be useful to others.

Download all domain names and root web page titles on the WWW 
using Berkeley sockets and C++.

World wide web title and domain name collecting world wide web robot.

www-robot.cpp -- a world wide web Internet web robot (this file).

robots.txt -- each site that has this file will be 
              processed according to the rules of 
              the robots.txt file in its web server's 
              root directory.
           -- 


Example:
   http://www.domain_name.com/robots.txt

Author: Matthew William Coan
Date: Mon Nov 10 21:46:48 EST 2008

*/

#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <list>
#include <vector>
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <strings.h>
#include <arpa/inet.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <sys/file.h>
#include <netdb.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <fcntl.h>
#include <pthread.h>
#include <signal.h>

#include "str.h"
#include "csv.h"

// Compile-time configuration of the robot.

// NOTE(review): not referenced in the visible part of this file;
// presumably limits the crawl to a single site when true -- confirm.
#define ONLY_ONE_WEB_SITE false
// NOTE(review): only referenced from commented-out code in read_URL();
// presumably the number of worker threads -- confirm.
#define NUMBER_OF_THREADS 5
// Maximum number of '/' characters a request URI may contain before
// process_web_page() refuses to fetch it.
#define URL_DOCUMENT_DEPTH 1
// Socket send/receive timeout, in seconds, applied by Connect().
#define HTTP_TIME_OUT 8 
// NOTE(review): not referenced in the visible part of this file.
#define HTTP_ROBOT_VERSION "1.0.0a"
// Value sent in the User-Agent request header and used as the robot's
// own name in robots_txt records.
#define HTTP_ROBOT_NAME "TitleBot"
// Largest Content-Length (in bytes, 2 MiB) process_web_page() accepts.
#define HTTP_CONTENT_LENGTH ((1024 * 1024) * 2)
// CSV file that save_URL() appends one record per new domain to.
#define URL_FILE_NAME "./www-titles.csv"
// NOTE(review): not referenced in the visible part of this file; the
// code spells the "/robots.txt" literal out instead.
#define ROBOTS_TXT_URI "/robots.txt"

// Domain names that save_URL() has already written to URL_FILE_NAME
// (key: domain name, value: the string "true").  Accessed by save_URL()
// while it holds TitleBot::_mutex2.
map< string, string > URL_map;

namespace TitleBot {

using namespace std;
using namespace csv_reader;

// NOTE(review): not referenced in the visible part of this file (the
// URL queue code uses _thread_count instead) -- confirm it is still needed.
int thread_count = 0;

// Condition variable waited on by read_URL() and broadcast by write_URL();
// always used together with _connect_mutex.
// NOTE(review): initialisation of these pthread objects is not visible here.
pthread_cond_t _cond;
// Held by process_robots_txt() around its read_robot() lookup.
pthread_mutex_t _mutex;
// Held by save_URL() while it updates URL_map and the CSV file.
pthread_mutex_t _mutex2;
// Guards the pending-URL list and seen-URL map in write_URL()/read_URL().
pthread_mutex_t _connect_mutex;

// Ordered collection of strings (used for robots_txt exclusion rules).
typedef vector< string > string_vector_type;

// robots_txt -- the exclusion rules that apply to one robot (user agent)
// on one web server, as collected from that server's robots.txt file.
//
// Rules are kept as URI patterns in the order they were added.  A pattern
// matches every URI it is a prefix of, and a '*' inside a pattern matches
// any run of characters (including none).  A record can be stored to and
// restored from a binary stream with do_write() / do_read().
//
// The member functions print trace lines to cout, as the rest of this
// file does.
class robots_txt {
   typedef std::vector< std::string > exclude_list_type;

   // Largest string length do_read() accepts from a stream.  It keeps a
   // corrupt or truncated file from triggering a huge allocation.
   enum { MAX_FIELD_SIZE = 1024 * 1024 };

   std::string _user_agent;
   std::string _domain_name;
   int _port;
   exclude_list_type _exclude;

   // Read one string stored by write_field(): a size_t byte count followed
   // by that many bytes (the text plus its terminating NUL).
   // Returns false on a short read or an implausible length.
   template< class T_istream >
   static bool
   read_field(T_istream & input, std::string & value)
   {
      size_t sz = 0;

      input.read(reinterpret_cast< char* >(&sz), sizeof(size_t));

      if(!input || sz > static_cast< size_t >(MAX_FIELD_SIZE)) {
         return false;
      }

      // The spare byte guarantees termination even when the stored data
      // has no trailing NUL or the stored length is zero.
      std::vector< char > buf(sz + 1, '\0');

      if(sz > 0) {
         input.read(&buf[0], sz);

         if(!input) {
            return false;
         }
      }

      value = std::string(&buf[0]);

      return true;
   }

   // Store one string as a size_t byte count followed by the text and its
   // terminating NUL.
   template< class T_ostream >
   static void
   write_field(T_ostream & output, const std::string & value)
   {
      const size_t sz = value.size() + 1;

      output.write(reinterpret_cast< const char* >(&sz), sizeof(size_t));
      output.write(value.c_str(), sz);
   }

public:
   robots_txt()
      : _port(0)
   {
   }

   robots_txt(const robots_txt & robot) 
      : _user_agent(robot._user_agent),
        _domain_name(robot._domain_name),
        _port(robot._port),
        _exclude(robot._exclude)
   {
   }

   // user_agent:  name of the robot the rules apply to.
   // domain_name: host the rules were read from.
   // port:        port of that host.
   robots_txt(const std::string & user_agent,
              const std::string & domain_name,
              const int port)
      : _user_agent(user_agent),
        _domain_name(domain_name),
        _port(port)
   {
   }

   virtual ~robots_txt() 
   {
   }

   robots_txt & operator=(const robots_txt & right)
   {
      _user_agent = right._user_agent;
      _domain_name = right._domain_name;
      _port = right._port;
      _exclude = right._exclude;

      return *this;
   }

   // Return true when pattern `ex` matches the beginning of `URI`.
   //
   // Characters match literally except '*', which matches any run of
   // characters (including none).  The pattern only has to cover a prefix
   // of the URI, so "" and "*" match everything.
   bool match_URI(const std::string & URI, 
                  const std::string & ex) const
   {
      std::cout << "match_URI(\"" << URI << "\",\"" << ex << "\")"
                << std::endl;

      const std::string::size_type npos = std::string::npos;

      std::string::size_type u = 0;        // next URI character to match
      std::string::size_type p = 0;        // next pattern character
      std::string::size_type star = npos;  // position of the last '*' seen
      std::string::size_type resume = 0;   // URI position that '*' restarts at
      bool ret = true;                     // an exhausted pattern is a match

      while(p < ex.size()) {
         if(ex[p] == '*') {
            // Let the wildcard match nothing for now and remember where
            // to come back to if the rest of the pattern fails.
            star = p++;
            resume = u;
         }
         else if(u < URI.size() && URI[u] == ex[p]) {
            u++;
            p++;
         }
         else if(star != npos && resume < URI.size()) {
            // Mismatch after a wildcard: let the wildcard swallow one
            // more URI character and retry the rest of the pattern.
            resume++;
            u = resume;
            p = star + 1;
         }
         else {
            ret = false;

            break;
         }
      }

      std::cout << "return: " << ret << std::endl;

      return ret;
   }

   // Add one exclusion pattern.  Empty patterns are ignored.
   void exclude(const std::string & URI) 
   {
      if(URI.empty()) {
         return;
      }

      std::cout << "DO_EXCLUDE: " << URI << std::endl;

      _exclude.push_back(URI);
   }

   // Return false when any stored exclusion pattern matches `URI`
   // (see match_URI()), true otherwise.
   bool allow(const std::string & URI) const
   {
      bool ret = true;

      for(size_t i = 0; i < _exclude.size(); i++) {
         const std::string & rule = _exclude[i];

         std::cout << "EXCLUDE: " << rule << std::endl;

         if(rule.empty()) {
            continue;
         }

         if(match_URI(URI, rule)) {
            ret = false;

            break;
         }
      }

      if(!ret) {
         std::cout << "DISALLOW: " << URI << std::endl;
      }

      return ret;
   }

   const std::string &
   get_domain_name() const
   {
      return _domain_name;
   }

   // Restore a record written by do_write().
   //
   // Layout (native byte order and type widths): user agent string,
   // domain name string, int port, size_t rule count, then one string per
   // rule; every string is stored as described at write_field().
   //
   // Returns false, leaving this object unchanged, when the stream ends
   // early or holds an implausible string length.
   template< class T_istream >
   bool
   do_read(T_istream & input) 
   {
      std::string user_agent;
      std::string domain_name;
      int port = 0;
      size_t count = 0;

      if(!read_field(input, user_agent)
         || !read_field(input, domain_name)) {
         return false;
      }

      input.read(reinterpret_cast< char* >(&port), sizeof(int));

      if(!input) {
         return false;
      }

      // do_write() stores the count as a size_t, so it must be read back
      // with the same width to keep the following fields aligned.
      input.read(reinterpret_cast< char* >(&count), sizeof(size_t));

      if(!input) {
         return false;
      }

      std::cout << "READ EXCLUDE COUNT: " << count << std::endl;

      exclude_list_type rules;

      for(size_t i = 0; i < count; i++) {
         std::string rule;

         if(!read_field(input, rule)) {
            return false;
         }

         std::cout << "READ_EXCLUDE: " << rule << std::endl;

         rules.push_back(rule);
      }

      // Everything was read successfully: commit.
      _user_agent.swap(user_agent);
      _domain_name.swap(domain_name);
      _port = port;
      _exclude.swap(rules);

      return true;
   }

   // Store this record in the layout described at do_read() and flush
   // the stream.  Returns false when the stream reports a failure.
   template< class T_ostream >
   bool 
   do_write(T_ostream & output) const
   {
      write_field(output, _user_agent);
      write_field(output, _domain_name);

      output.write(reinterpret_cast< const char* >(&_port), sizeof(int));

      const size_t count = _exclude.size();

      output.write(reinterpret_cast< const char* >(&count), sizeof(size_t));

      for(size_t ii = 0; ii < _exclude.size(); ii++) {
         std::cout << "write exclude: " << _exclude[ii] << std::endl;

         write_field(output, _exclude[ii]);
      }

      output.flush();

      std::cout << "do_write: _exclude.size() == " << _exclude.size()
                << std::endl;

      return !output.fail();
   }
};

// Stream extraction for robots_txt: restores `bot` from `input` through
// robots_txt::do_read().  The success flag that do_read() returns is not
// inspected here.
template< class T_istream >
T_istream &
operator>>(T_istream & input, robots_txt & bot) 
{
   static_cast< void >(bot.do_read(input));

   return input;
}

// Stream insertion for robots_txt: stores `bot` on `output` through
// robots_txt::do_write().  The value do_write() returns is not inspected.
template< class T_ostream >
T_ostream &
operator<<(T_ostream & output, robots_txt & bot) 
{
   static_cast< void >(bot.do_write(output));

   return output;
}


// connect() wrapper that first installs `timeout` as the socket's receive
// and send timeout.
//
// sfd:     socket descriptor.
// addr:    peer address, `addrlen` bytes long.
// timeout: limit to install; when null, connect() is called directly.
//
// Returns the result of connect(), or -1 when a timeout option could not
// be set.
inline
int
connectWithTimeout(int sfd,
                   sockaddr * addr,
                   int addrlen,
                   timeval * timeout)
{
   if(timeout == NULL) {
      return connect(sfd, addr, addrlen);
   }

   const int options[] = { SO_RCVTIMEO, SO_SNDTIMEO };
   const size_t option_count = sizeof(options) / sizeof(options[0]);

   for(size_t n = 0; n < option_count; n++) {
      if(setsockopt(sfd,
                    SOL_SOCKET,
                    options[n],
                    (char *)timeout,
                    sizeof(*timeout)) < 0) {
         return -1;
      }
   }

   return connect(sfd, addr, addrlen);
}

// Connect `sockfd` to `serv_addr` (`size` bytes long) with socket timeouts
// of HTTP_TIME_OUT seconds.  Returns what connectWithTimeout() returns.
inline
int 
Connect(int sockfd,
        sockaddr * serv_addr,
        size_t size)
{
   timeval limit = timeval();   // all fields start at zero

   limit.tv_sec = HTTP_TIME_OUT;

   return connectWithTimeout(sockfd, serv_addr, size, &limit);
}

// NOTE(review): not referenced in the visible part of this file.
int fork_count = 0;
// Number of threads currently blocked in read_URL() waiting for a URL;
// modified only while _connect_mutex is held.
int _thread_count = 0;

// robots.txt records keyed by domain name.  domain_name_finder declares
// an identical typedef of its own.
typedef map< string, robots_txt > robot_map_type;

// The crawler: fetches pages over HTTP, extracts their titles and links,
// queues newly found URLs and records one CSV line per new domain.
class domain_name_finder {
public:
   typedef list< string > string_list_type;
   typedef map< string, robots_txt > robot_map_type;

private:
   // NOTE(review): not referenced in the visible part of this file.
   string current_domain;
   // Reference to a string list owned elsewhere.
   // NOTE(review): not used in the visible part of this file; the URL
   // queue handled by write_URL()/read_URL() is _url_list.
   string_list_type & _queue;
   // Host of the page being processed: set by open_connection() and by
   // process_web_page(), and used to turn relative links into URLs.
   string _domain_name;
   // URI part of the URL being processed (set by process_web_page()).
   string _currentURI;
   // NOTE(review): not referenced in the visible part of this file.
   int _wait_count;
   // NOTE(review): not referenced in the visible part of this file.
   fstream _robot_file;
   // NOTE(review): not referenced in the visible part of this file.
   string _current_URL;
   // robots.txt records keyed by domain name; see write_robot() and
   // read_robot().
   robot_map_type _robot_map;

   // Store (or replace) the record for bot's domain in _robot_map.
   void
   write_robot(robots_txt & bot)
   {
      const string & domain = bot.get_domain_name();

      _robot_map[domain] = bot;
   }

   // Look up the robots.txt record stored for the domain of `url`.
   //
   // url:   any URL on the domain of interest.
   // error: set to false when a record was found and to true when the
   //        domain has no record yet.
   //
   // Returns a copy of the stored record, or a default-constructed
   // robots_txt when there is none.  The lookup does not add an entry to
   // _robot_map.
   robots_txt
   read_robot(const string & url, bool & error)
   {
      const robot_map_type::iterator hit =
         _robot_map.find(get_domain_name(url));

      if(hit == _robot_map.end()) {
         error = true;

         return robots_txt();
      }

      error = false;

      return hit->second;
   }

   // Build the robots.txt URL for the domain of `url`:
   // "http://<domain>/robots.txt".  A ":<port>" part is only added for a
   // port other than 80, and the port is currently fixed at 80.
   string
   robot_URL(const string & url)
   {
      const int port = 80;//get_port(url);

      string ret = string("http://") + get_domain_name(url);

      if(port != 80) {
         char digits[64];

         memset(digits, 0, sizeof(digits));

         sprintf(digits, "%d", port);

         ret += string(":") + string(digits);
      }

      ret += string("/robots.txt");

      return ret;
   }

   // Queue a URL for fetching unless it has been queued before.
   //
   // The URL is first cut at its second '?' or ';' (the first one and the
   // text after it are kept).  Under _connect_mutex the result is added
   // to _url_list and _url_map if it is new; waiters on _cond are woken
   // in either case.
   void 
   write_URL(const string & the_url)
   {
      string::size_type cut = the_url.find_first_of("?;");

      if(cut != string::npos) {
         cut = the_url.find_first_of("?;", cut + 1);
      }

      const string url = (cut == string::npos)
                         ? the_url
                         : the_url.substr(0, cut);

      pthread_mutex_lock(&_connect_mutex);

      const bool is_new =
         _url_map.insert(map< string, bool >::value_type(url, true)).second;

      if(is_new) {
cout << "WRITE URL: " << url << endl << flush;
         _url_list.push_back(url);
      }

      pthread_cond_broadcast(&_cond);
      pthread_mutex_unlock(&_connect_mutex);
   }

   // URLs waiting to be fetched, in the order write_URL() queued them;
   // read_URL() takes them from the front.  Accessed under _connect_mutex.
   list< string > _url_list;
   // Every URL ever queued by write_URL() (value is always true); used to
   // avoid queueing a URL twice.  Accessed under _connect_mutex.
   map< string, bool > _url_map;

   // Take the next URL off the front of _url_list, blocking on _cond
   // until one is available.  _thread_count is raised for the duration of
   // each wait.  All of this happens with _connect_mutex held.
   string 
   read_URL() 
   {
      pthread_mutex_lock(&_connect_mutex);

      while(_url_list.empty()) {
         ++_thread_count;

         pthread_cond_wait(&_cond, &_connect_mutex);

         --_thread_count;
      }

      const string next = _url_list.front();

      _url_list.pop_front();

      pthread_mutex_unlock(&_connect_mutex);

      return next;
   }

   // Open a TCP connection to `name`:`port`.
   //
   // name: host name or dotted-quad IPv4 address; also stored in
   //       _domain_name.
   // port: TCP port in host byte order.
   //
   // Returns the connected socket descriptor (the caller closes it), or
   // -1 when the name cannot be resolved, no socket can be created or the
   // connection attempt fails.
   int
   open_connection(const string & name,
                   const int port)
   {
      _domain_name = name;

      // getaddrinfo() handles host names and numeric addresses alike and,
      // unlike gethostbyname()/gethostbyaddr(), does not hand back shared
      // static storage, so concurrent callers cannot clobber each other.
      addrinfo hints;

      memset(&hints, 0, sizeof(hints));
      hints.ai_family = AF_INET;
      hints.ai_socktype = SOCK_STREAM;

      addrinfo * result = NULL;

      if(getaddrinfo(name.c_str(), NULL, &hints, &result) != 0
         || result == NULL) {
         return -1;
      }

      sockaddr_in serv_addr;

      memset(&serv_addr, 0, sizeof(serv_addr));

      const bool usable = (result->ai_family == AF_INET)
                          && (result->ai_addrlen >= sizeof(serv_addr));

      if(usable) {
         memcpy(&serv_addr, result->ai_addr, sizeof(serv_addr));
      }

      freeaddrinfo(result);

      if(!usable) {
         return -1;
      }

      serv_addr.sin_family = AF_INET;
      serv_addr.sin_port = htons(port);

      const int sockfd = socket(AF_INET, SOCK_STREAM, 0);

      if(sockfd < 0) {
         cerr << "unable to create socket for: " 
              << name << ":" << port << endl;

         return -1;
      }

      if(Connect(sockfd, 
                 (struct sockaddr *) &serv_addr,
                 sizeof(serv_addr)) < 0) {
         cerr << "unable to open connection: " 
              << name << ":" << port << endl << flush;

         // Do not leak the descriptor of a socket that never connected.
         close(sockfd);

         return -1;
      }

      return sockfd;
   }

   // Case-insensitive strstr().
   //
   // page: NUL-terminated text to search.
   // str:  NUL-terminated text to look for.
   //
   // Returns a pointer to the first place in `page` where `str` occurs,
   // ignoring case, or 0 when there is none.  An empty `str` is never
   // found (unlike strstr(), which would return `page`).
   const char *
   my_strstr(const char * page,
             const char * str)
   {
      const size_t wanted = strlen(str);

      if(wanted == 0) {
         return 0;
      }

      for(const char * start = page; *start; start++) {
         size_t matched = 0;

         // The casts keep bytes above 0x7f from reaching tolower() as
         // negative values, which it is not defined for.
         while(matched < wanted
               && start[matched] != '\0'
               && tolower(static_cast< unsigned char >(start[matched]))
                  == tolower(static_cast< unsigned char >(str[matched]))) {
            matched++;
         }

         if(matched == wanted) {
            return start;
         }
      }

      return 0;
   }

   // Extract the link targets found in one chunk of HTML and queue them
   // with write_URL().
   //
   // page: NUL-terminated HTML text (the caller in this file passes one
   //       line of a response body at a time).
   //
   // Every HREF= and SRC= attribute (case-insensitive) is examined.  The
   // value ends at a quote, '<', '>', '#', white space, the end of the
   // text, or at an '&' that comes before any '?'.  Values are handled as
   // follows:
   //   - "http://..."  is rebuilt from get_domain_name() and get_URI();
   //   - values with any other scheme (mailto:, javascript:, https:, ...)
   //     are skipped because this robot only fetches plain HTTP;
   //   - everything else is treated as a path on _domain_name.
   //
   // This function used to return immediately unless the text contained
   // the (misspelled) string "CONTNET-TYPE: TEXT/HTML".  That string is a
   // response header, which is never part of the body text passed in, so
   // no link was ever extracted.  The content type is checked by the
   // caller before the body is read.
   void
   process_web_page(char * page)
   {
      const char * const search[] = { "HREF=", "SRC=" };
      const size_t SZ = sizeof(search) / sizeof(search[0]);

      for(size_t j = 0; j < SZ; j++) {
         const size_t attr_len = strlen(search[j]);
         const char * p = my_strstr(page, search[j]);

         while(p != NULL) {
            // Step over the attribute name and an opening quote, if any.
            p += attr_len;

            if((*p) == '\"' || (*p) == '\'') {
               p++;
            }

            string temp;
            bool got_question_mark = false;

            while((*p) != '\0'
                  && (*p) != '\"'
                  && (*p) != '>'
                  && (*p) != '<'
                  && (*p) != '\''
                  && (*p) != '#'
                  && !isspace(static_cast< unsigned char >(*p))) {
               if((*p) == '?') {
                  got_question_mark = true;
               }
               else if((*p) == '&' && !got_question_mark) {
                  break;
               }

               temp += *p;

               p++;
            }

            if(!temp.empty()) {
               // A ':' ahead of the first '/' or '?' means the value
               // starts with a scheme and is not a relative reference.
               const string::size_type colon = temp.find(':');
               const string::size_type delim = temp.find_first_of("/?");
               const bool has_scheme = (colon != string::npos)
                                       && (delim == string::npos
                                           || colon < delim);

               if(my_strstr(temp.c_str(), "http://") == temp.c_str()) {
                  write_URL(string("http://")
                            + get_domain_name(temp)
                            + get_URI(temp));
               }
               else if(has_scheme) {
                  // Not fetchable by this robot: ignore.
               }
               else if(temp[0] == '/') {
                  write_URL(string("http://") + _domain_name + temp);
               }
               else {
                  write_URL(string("http://")
                            + _domain_name
                            + string("/")
                            + temp);
               }
            }

            // Continue after the value just consumed.
            p = my_strstr(p, search[j]);
         }
      }
   }

   void
   process_web_page(list< string > & page)
   {
      char * line = 0;

      for(list< string >::iterator p = page.begin();
          p != page.end(); p++) {
         line = new char[p->size() + 1];

         memset(line, 0, p->size() + 1);

         strcpy(line, p->c_str());

         process_web_page(line);

         delete [] line;
      }
   }

   // Read one line from descriptor `fd`, one byte at a time.
   //
   // fd:    descriptor to read from.
   // error: set to true only when nothing at all could be read (end of
   //        input or a read failure before the first byte); set to false
   //        otherwise.
   //
   // Returns the bytes read up to and including the first '\n', or up to
   // the point where reading stopped.  Bytes that are neither white
   // space, alphanumeric nor punctuation are replaced by a blank.
   std::string
   read_line(int fd, 
             bool & error)
   {
      std::string ret;

      error = false;

      for(;;) {
         char ch;

         if(read(fd, &ch, 1) != 1) {
            // A partial line is still returned as data; only an empty
            // result is reported as an error.
            error = ret.empty();

            break;
         }

         // The <cctype> classifiers are only defined for values that fit
         // in an unsigned char, so bytes above 0x7f must not be passed
         // as (negative) plain char.
         const unsigned char uch = static_cast< unsigned char >(ch);

         if(!isspace(uch) && !isalnum(uch) && !ispunct(uch)) {
            ch = ' ';
         }

         ret += ch;

         if(ch == '\n') {
            break;
         }
      }

      return ret;
   }

   // Remove markup from a piece of HTML text.
   //
   //   - A tag, from '<' through the next '>' (or through the end of the
   //     text when it is never closed), is dropped.
   //   - A character reference such as "&amp;" or "&#169;" is replaced by
   //     one blank.
   //   - A '>' outside a tag is dropped.
   //   - Every other character, including an '&' that does not start a
   //     character reference, is copied unchanged.
   //
   // Tags and character references are tracked separately, so a ';'
   // inside a tag does not end the tag and a lone '&' (as in "AT&T") does
   // not swallow the text that follows it.
   std::string
   strip_HTML(const std::string & arg)
   {
      // Longest reference name accepted between '&' and ';'.
      const std::string::size_type MAX_NAME = 32;

      const std::string::size_type n = arg.size();
      std::string ret;
      std::string::size_type i = 0;

      while(i < n) {
         const char ch = arg[i];

         if(ch == '<') {
            while(i < n && arg[i] != '>') {
               i++;
            }

            if(i < n) {
               i++;   // the closing '>'
            }
         }
         else if(ch == '>') {
            i++;
         }
         else if(ch == '&') {
            std::string::size_type j = i + 1;

            if(j < n && arg[j] == '#') {
               j++;
            }

            const std::string::size_type name_begin = j;

            while(j < n
                  && (j - name_begin) < MAX_NAME
                  && isalnum(static_cast< unsigned char >(arg[j]))) {
               j++;
            }

            if(j > name_begin && j < n && arg[j] == ';') {
               ret += " ";
               i = j + 1;
            }
            else {
               ret += ch;
               i++;
            }
         }
         else {
            ret += ch;
            i++;
         }
      }

      return ret;
   }

   // Normalise the spacing of `str`: the text is split into words at
   // blanks, tabs, carriage returns, newlines and double quotes, and the
   // words are joined again with single blanks.  The result therefore has
   // no leading or trailing white space and contains no double quotes.
   //
   // The words are located with std::string searches instead of strtok(),
   // which keeps hidden static state and must not be used from several
   // threads at once.
   std::string
   trim(const std::string & str)
   {
      const char sep[] = " \r\n\t\"";

      std::string ret;
      std::string::size_type begin = str.find_first_not_of(sep);

      while(begin != std::string::npos) {
         const std::string::size_type end = str.find_first_of(sep, begin);

         if(!ret.empty()) {
            ret += ' ';
         }

         if(end == std::string::npos) {
            ret.append(str, begin, std::string::npos);

            break;
         }

         ret.append(str, begin, end - begin);

         begin = str.find_first_not_of(sep, end);
      }

      return ret;
   }

   // Return the text of the page's <title> element, with markup removed
   // by strip_HTML() and spacing normalised by trim().
   //
   // Lines are gathered from the last line containing "<title>" that
   // precedes the first line containing "</title>", through that closing
   // line; the title is the text between the first "<title>" and the
   // first "</title>" of what was gathered.  Tag names are matched
   // case-insensitively.  An empty string is returned when no complete
   // title is found.
   string
   get_title(list< string > & the_page)
   {
      string span;
      bool collecting = false;

      for(list< string >::iterator it = the_page.begin();
          it != the_page.end(); ++it) {
         const char * text = it->c_str();

         if(my_strstr(text, "<title>") != NULL) {
            collecting = true;

            span = "";
         }

         if(collecting) {
            span += *it;
         }

         if(my_strstr(text, "</title>") != NULL) {
            break;
         }
      }

      const char * base = span.c_str();
      const char * begin = my_strstr(base, "<title>");
      const char * end = my_strstr(base, "</title>");

      string title;

      if(begin && end) {
         if(begin < end) {
            // 7 == length of "<title>"
            title.assign(begin + 7, end);
         }

         title = trim(strip_HTML(title));
      }

      return title;
   }

   // Return the directory part of `uri`: everything up to and including
   // its last '/'.  A URI without any '/' (or an empty one) has no
   // directory part and yields an empty string.
   std::string
   prep_URI(const std::string & uri)
   {
      const std::string::size_type slash = uri.rfind('/');

      if(slash == std::string::npos) {
         return std::string();
      }

      return uri.substr(0, slash + 1);
   }

   // Fetch one document over an already connected socket and process it.
   //
   // fd:           connected socket.
   // uri:          request path; an empty path is replaced by "/".
   // query_string: appended to the path after a '?' when not empty.
   // user_agent:   receives (appended) the value of the response's
   //               "Server:" header, when there is one.
   // url:          full URL of the document, used for bookkeeping.
   //
   // Nothing is fetched when the path contains more than
   // URL_DOCUMENT_DEPTH '/' characters.  The response is dropped when
   //   - the request cannot be written or the response header cannot be
   //     read,
   //   - the status line contains neither "200 OK" nor "302 Found",
   //   - the header does not mention "text/html", or
   //   - the declared Content-Length, or the body actually received,
   //     exceeds HTTP_CONTENT_LENGTH bytes.
   // Otherwise the title is extracted, the links in the body are queued
   // and the URL is recorded with save_URL().
   void 
   process_web_page(int fd, 
                    string & uri,
                    const string & query_string, 
                    string & user_agent,
                    const string & url)
   {
      // Header lines read before the header is considered complete anyway.
      const int MAX_HEADER_LINES = 50;

      if(uri.size() == 0) {
         uri = string("/");
      }

      if(std::count(uri.begin(), uri.end(), '/') > URL_DOCUMENT_DEPTH) { 
         return;
      }

      string request = string("GET ") + uri;

      if(query_string.size() != 0) {
         request += string("?") + query_string;
      }

      request += string(" HTTP/1.0\r\n")
                 + string("User-Agent: ") 
                 + string(HTTP_ROBOT_NAME) + string("\r\n")
                 + string("Accept: text/html\r\n")
                 + string("\r\n");

      const ssize_t sent = write(fd, request.c_str(), request.size());

      if(sent < 0 || static_cast< size_t >(sent) != request.size()) {
         cerr << "error: write HTTP request error:" << endl;
         return;
      }

      string line;
      string header;
      bool error = false;
      int header_lines = 0;

      // Read the header up to the empty line that ends it.  A bare "\n"
      // is accepted as well, for servers that do not send CR LF.
      for(;;) {
         line = read_line(fd, error);

         if(error) {
            return;
         }

         if(line == "\r\n" || line == "\n") {
            break;
         }

         if(header_lines == 0) {
            // Status line.
            if((my_strstr(line.c_str(), "200 OK") == NULL)
               && (my_strstr(line.c_str(), "302 Found") == NULL)) {
               return;
            }
         }

         header_lines++;

         if(header_lines >= MAX_HEADER_LINES) {
            break;
         }

         header += line;
      }

      // Media types are case-insensitive.
      if(my_strstr(header.c_str(), "text/html") == NULL) {
         return;
      }

      const char * server = my_strstr(header.c_str(), "server:");

      if(server) {
         server += 7;   // length of "server:"

         while(*server == ' ' || *server == '\t') {
            server++;
         }

         while(*server != '\0' && *server != '\r' && *server != '\n') {
            user_agent += *server;

            server++;
         }
      }

      const char * length = my_strstr(header.c_str(), "content-length:");

      if(length != NULL) {
         length += 15;   // length of "content-length:"

         // strtoul() skips leading white space and saturates instead of
         // overflowing on absurdly long digit strings.
         const unsigned long declared = strtoul(length, NULL, 10);

         if(declared > static_cast< unsigned long >(HTTP_CONTENT_LENGTH)) {
            return;
         }
      }

      list< string > page;
      size_t received = 0;

      // Read the body until the connection ends.  read_line() has already
      // replaced every non-text byte with a blank, so the lines need no
      // further character checks.  The size limit is enforced on what is
      // actually received because Content-Length may be missing or wrong.
      for(;;) {
         line = read_line(fd, error);

         if(error) {
            break;
         }

         received += line.size();

         if(received > static_cast< size_t >(HTTP_CONTENT_LENGTH)) {
            return;
         }

         page.push_back(line);
      }

      string title = get_title(page);

      _domain_name = get_domain_name(url);

      _currentURI = get_URI(url);

      process_web_page(page);

      save_URL(url, title, user_agent);
   }

   // Return a copy of `arg1` with every occurrence of the four characters
   // "amp;" removed (so that "&amp;" becomes "&").  The text is scanned
   // left to right and removed occurrences do not overlap.
   string
   remove_amp(const string & arg1)
   {
      const string token("amp;");

      string ret;
      string::size_type pos = 0;

      while(pos < arg1.size()) {
         if(arg1.compare(pos, token.size(), token) == 0) {
            pos += token.size();
         }
         else {
            ret += arg1[pos];
            pos++;
         }
      }

      return ret;
   }

   // String-to-string map type.
   typedef map< string, string > string_map_type;

   // NOTE(review): not referenced in the visible part of this file;
   // save_URL() opens a stream of its own for each record.
   fstream _out;

   //string_map_type URL_map;

   // Return true when `uri` names a site's root document: the empty
   // path, "/", or one of the usual index/default file names directly
   // under the root.  The comparison is exact (case-sensitive).
   bool
   is_root_document(const char * uri)
   {
      const char * const roots[] = {
         "/",
         "",
         "/default.html",
         "/index.asp",
         "/index.php",
         "/index.jsp",
         "/index.html",
         "/default.htm",
         "/index.htm"
      };
      const size_t root_count = sizeof(roots) / sizeof(roots[0]);

      for(size_t n = 0; n < root_count; n++) {
         if(strcmp(uri, roots[n]) == 0) {
            return true;
         }
      }

      return false;
   }


   // Record a fetched document in the CSV file URL_FILE_NAME.
   //
   // the_url:    URL of the document.
   // the_title:  its title.
   // user_agent: value of the server's "Server:" header.
   //
   // Only the first document of each domain is recorded: the domain is
   // looked up in URL_map and, when new, one line
   //    "<url>","<title>","<server>"
   // is appended to the file and the domain is added to URL_map.  When
   // the file cannot be opened an error is printed and the domain stays
   // unrecorded.  Everything happens with _mutex2 held, and the mutex is
   // released on every path.
   //
   // NOTE(review): the fields are written as they are; a double quote in
   // the URL or Server value would produce a malformed record.
   void
   save_URL(const string & the_url,
            const string & the_title,
            const string & user_agent)
   {
      pthread_mutex_lock(&_mutex2);

      const string dns = get_domain_name(the_url);

      if(URL_map.find(dns) == URL_map.end()) {
         fstream out(URL_FILE_NAME, ios::out | ios::app);

         if(!out) {
            cerr << "unable to append to url file...." << endl;
         }
         else {
            URL_map[dns] = "true";
            out << "\"" << the_url << "\",\"" 
                << the_title << "\",\"" 
                << user_agent
                << "\"" << endl;
         }

         // `out` is closed when it goes out of scope here.
      }

      pthread_mutex_unlock(&_mutex2);
   }

   // Return true when the path of `url` starts with "/robots.txt"
   // (compared case-insensitively).
   //
   // url: expected to start with a 7-character scheme prefix such as
   //      "http://"; the path is taken to begin at the first '/' after
   //      that prefix.  URLs too short to hold a host, and URLs without
   //      a path, yield false.
   bool
   is_robots_txt(const std::string & url)
   {
      const size_t SCHEME_LEN = 7;    // length of "http://"
      const size_t ROBOTS_LEN = 11;   // length of "/robots.txt"

      if(url.size() <= SCHEME_LEN) {
         return false;
      }

      const char * path = strchr(url.c_str() + SCHEME_LEN, '/');

      return (path != NULL)
             && (strncasecmp(path, "/robots.txt", ROBOTS_LEN) == 0);
   }

   // Return the value of a robots.txt "Disallow:" line: the text after
   // the (case-insensitively matched) field name, normalised with trim().
   // A line without "Disallow:" yields an empty string.
   string
   get_robot_URI(string robot_line)
   {
      const char * const field = my_strstr(robot_line.c_str(), "Disallow:");

      if(field == NULL) {
         return string();
      }

      // 9 == length of "Disallow:"
      return trim(string(field + 9));
   }

   // Return the value of a robots.txt "User-agent:" line: the text after
   // the (case-insensitively matched) field name, normalised with trim().
   // A line without "User-agent:" yields an empty string.
   string
   get_user_agent(string robot_line) 
   {
      // Length of "User-agent:".  The value starts right after the colon;
      // trim() removes any blanks in front of it, so nothing more may be
      // skipped here (the line may end directly after the colon).
      const size_t FIELD_LEN = 11;

      string ret;

      const char * ptr = my_strstr(robot_line.c_str(), "User-agent:");

      if(ptr != NULL) {
         ret = trim(string(ptr + FIELD_LEN));
      }

      return ret;
   }

   // Decide whether a robots.txt line addresses the given robot.
   //
   // robot_line: one line of a robots.txt file.
   // user_agent: name of the robot.
   //
   // Returns true when the line's User-agent value (see get_user_agent())
   // is "*", when `user_agent` itself is "*", or when the two names are
   // equal ignoring case.
   bool
   match_user_agent(const string & robot_line,
                    const string & user_agent)
   {
      const string ua = trim(get_user_agent(robot_line));

      return (ua == "*")
             || (user_agent == "*")
             || (strcasecmp(ua.c_str(), user_agent.c_str()) == 0);
   }

   /**
    * Download /robots.txt over an already-open connection and record
    * the exclusions that apply to this robot.
    *
    * Steps, in order:
    *   1. With _mutex held, call read_robot() for the robot URL of
    *      `url`. The returned object (`bot`) is not used afterwards.
    *   2. Build a fresh robots_txt for HTTP_ROBOT_NAME, the URL's
    *      domain name and its port.
    *   3. Send "GET /robots.txt HTTP/1.0" with a User-Agent header;
    *      return immediately if the request is not written in full
    *      (nothing is stored in that case).
    *   4. Read header lines up to the blank "\r\n" line or a read error.
    *   5. Only when the header contains "200 OK" or "302 Found" and
    *      also "text/plain": scan the body for lines that
    *      match_user_agent() accepts and feed the lines that follow
    *      each of them to robot.exclude().
    *   6. With _mutex held, set _current_URL, clear _robot_file and
    *      store the result with write_robot().
    *
    * @param fd   connected descriptor used for the request and reply.
    * @param url  URL being processed; supplies domain name and port.
    */
   void
   process_robots_txt(int fd, const string & url)
   {
//cout << "process robots.txt: " << url << endl << flush;

      bool error = false;
      string temp;

      pthread_mutex_lock(&_mutex);

      //if(_robot_map.find(get_domain_name(url)) != _robot_map.end()) {

      // NOTE(review): `bot` is never read below; the call is kept only
      // for whatever side effects read_robot() has -- confirm.
      robots_txt bot = read_robot(robot_URL(url), error);
      
/**
      if(!error) {
cout << "DONE" << endl << flush;

         pthread_mutex_unlock(&_mutex);

         return;
      }
**/

      pthread_mutex_unlock(&_mutex);

      // Rule set that is filled in from the downloaded file.
      robots_txt robot(string(HTTP_ROBOT_NAME),
                       get_domain_name(url), 
                       get_port(url));
 
      string request =  string("GET /robots.txt HTTP/1.0\r\n")
                        + string("User-Agent: ") + string(HTTP_ROBOT_NAME)
                        + string("\r\n\r\n");

      // A failed or short write abandons the request without saving.
      if(write(fd, request.c_str(), request.size()) != request.size()) {
         return;
      }

      error = false;

//cout << "---BEGIN-ROBOTS.TXT---" << endl << flush;
 
      string header;

      // Collect the response header: every line up to the blank line.
      while(!error) {
         temp = read_line(fd, error);

         if(error) {
//cout << "error..." << endl << flush;
            break;
         }

//cout << temp << flush;

         if(temp == string("\r\n")) {
            break;
         }

         header += temp;
      }

      // The body is parsed only for these two status texts and only
      // when the header mentions text/plain.
      if((my_strstr(header.c_str(), "200 OK") != NULL)
         || (my_strstr(header.c_str(), "302 Found") != NULL)) {
         if((my_strstr(header.c_str(), "text/plain") != NULL)
            /*|| (my_strstr(header.c_str(), "text/html") != NULL)*/) {
            // NOTE(review): `flag` is assigned below but never read in
            // this function.
            bool flag = true;

            error = false;

            // Outer loop: look for a line that names this robot.
            while(!error) {
               temp = read_line(fd, error);

               if(error) {
//cout << "error..." << endl << flush;
                  break;
               }

               //if(temp == "") {
               //   continue;
               //}

               if(match_user_agent(temp, 
                  string(HTTP_ROBOT_NAME))) {

//cout << "MATCH_USER_AGENT(\"" << temp << "\",\"" 
//     << string(HTTP_ROBOT_NAME) << "\")" << endl << flush;

                  error = false;

                  // Inner loop: consume the lines of the matching group.
                  while(!error) {
                     temp = read_line(fd, error);

//cout << "temp: " << temp << endl << flush;
 
                     if(error) {
//cout << "error: " << error << endl << flush;
                        break;
                     }

                     // A line containing "User-Agent:" ends the group.
                     // That line has already been consumed, so the
                     // outer loop resumes with the line after it.
                     // NOTE(review): the spelling here is "User-Agent:"
                     // while get_user_agent() searches for
                     // "User-agent:"; whether both match the same lines
                     // depends on my_strstr()'s case handling -- confirm.
                     if(my_strstr(temp.c_str(), "User-Agent:") != NULL) {
//cout << "end..." << endl << flush;
                        flag = false;

                        break;
                     }

                     // Empty when the line carries no "Disallow:".
                     temp = get_robot_URI(temp);
 
                     // Drop one trailing '/' unless the path is just "/".
                     if(temp.size() > 0) {
                        if((temp[temp.size() - 1] == '/')
                           && (temp.size() > 1)) {
                           string rt;

                           for(size_t i = 0; i < temp.size() - 1; i++) {
                              rt += temp[i];
                           } 

                           temp = rt;
                        }
                     }
//cout << "EXCLUDE: \"" << temp << "\"" << endl << flush;

                     // Called for every line of the group, including
                     // lines for which temp is empty.
                     robot.exclude(temp);
                  }
               }
//cout << temp << endl << flush;
            }
         }
      }
//cout << "---END-ROBOTS.TXT---" << endl << flush;

      // Publish the result under the lock.
      pthread_mutex_lock(&_mutex);

      _current_URL = url;

      _robot_file.clear();

      write_robot(robot);

      pthread_mutex_unlock(&_mutex);
   }

   /**
    * Process one URL taken from the work list.
    *
    * Only URLs whose protocol is "http" are handled; anything else is
    * ignored. For an http URL this:
    *   - stores the domain name and URI in _domain_name / _currentURI,
    *   - opens a connection to the host (returning silently on failure),
    *   - hands the site's "/robots.txt" URL and then its root URL to
    *     write_URL(),
    *   - dispatches to process_robots_txt() when the URL itself is a
    *     robots.txt URL, otherwise to the five-argument
    *     process_web_page() overload with the URI "/",
    *   - closes the connection.
    *
    * @param url  URL to process.
    */
   void
   process_web_page(const string & url)
   {
      const string proto = get_proto(url);

      if(proto != "http") {
         return;
      }

      _domain_name = get_domain_name(url);

      int port = get_port(url);

      const string uri = get_URI(url);

      _currentURI = uri;

      string query_string = remove_amp(get_query_string(url));

      int fd = open_connection(_domain_name, port);

      if(fd < 0) {
         return;
      }

      // "http://host" for port 80, "http://host:port" otherwise.
      string site_prefix = string("http://") + string(_domain_name);

      if(port != 80) {
         char port_text[32];

         // Bounded formatting: cannot overrun the buffer.
         snprintf(port_text, sizeof(port_text), "%d", port);

         site_prefix += string(":") + string(port_text);
      }

      // Order matters to callers of write_URL(): robots.txt first,
      // then the site root.
      write_URL(site_prefix + string("/robots.txt"));

      write_URL(site_prefix + string("/"));

      _current_URL = url;

      if(is_robots_txt(url)) {
         process_robots_txt(fd, url);
      }
      else {
         string the_uri = "/";
         string user_agent = "";

         process_web_page(fd, the_uri, query_string, user_agent, url);
      }

      // fd is known to be >= 0 here; close it unconditionally so that
      // descriptor 0 is released as well.
      close(fd);
   }

   /**
    * Fetch the next URL to process.
    *
    * @return whatever read_URL() yields.
    */
   string 
   get_next_URL() 
   {
      return read_URL();
   }


public:

   /**
    * Construct a finder around a caller-supplied list.
    *
    * @param q  list used to initialize the _queue member.
    */
   domain_name_finder(string_list_type & q)
   :_queue(q)
   {
      // The counter starts at zero.
      _thread_count = 0;
   }

   /**
    * Virtual destructor with an empty body: no explicit cleanup is
    * performed here.
    */
   virtual ~domain_name_finder()
   {
   }

   /**
    * Worker loop: seed the URL containers from the command line, then
    * process URLs until get_next_URL() returns an empty string.
    *
    * @param argc  number of entries in argv.
    * @param argv  command-line arguments; argv[1..argc-1] are seed URLs.
    * @param envp  environment block (not used here).
    * @return always 0.
    */
   int
   main_loop(int argc,
             char ** argv, 
             char ** envp)
   {
      // Every command-line argument after the program name is a seed.
      for(int arg = 1; arg < argc; ++arg) {
         _url_list.push_back(string(argv[arg]));
         _url_map[string(argv[arg])] = true;
      }

      int processed = 0;
      string current;

      for(;;) {
         current = get_next_URL();

         // An empty URL marks the end of the work.
         if(current == "") {
            break;
         }

         cout << "PROCESS URL: " << current << endl << flush;

         ++processed;

         process_web_page(current);

         cout << "DONE PROCESSING URL: " << current << endl << flush;
      }

      cout << "processed web page count: " << processed << endl << flush;

      return 0;
   }
};


}

using namespace TitleBot;

// Copies of main()'s arguments; main() fills them in and my_thread()
// passes them to domain_name_finder::main_loop().
int my_argc;
char ** my_argv;
char ** my_envp;

// List handed to the domain_name_finder constructed in main(). main()
// dereferences this pointer, so it must be non-NULL by that point.
domain_name_finder::string_list_type * p_the_queue = NULL;

// Set to pthread_self() at the start of main().
pthread_t main_thread;

void *
my_thread(void * arg)
{
   domain_name_finder * dm_finder = reinterpret_cast< domain_name_finder * >(arg);

   dm_finder->main_loop(my_argc, my_argv, my_envp);

   return NULL;
}

// Handles of the worker threads: filled by pthread_create() in main(),
// joined there, and signalled from on_signal().
pthread_t thread_array[NUMBER_OF_THREADS];

// True until on_signal() has run once; set to true at the start of main().
bool first_signal;

// Signal handler installed by main() for SIGINT, SIGTERM and SIGHUP.
//
// On its first invocation it forwards SIGHUP to every thread in
// thread_array other than the calling one. On every invocation it then
// adds NUMBER_OF_THREADS to thread_count, signals _cond and ends the
// calling thread with pthread_exit().
//
// NOTE(review): pthread_mutex_lock(), pthread_cond_signal() and
// pthread_exit() are not on the POSIX list of async-signal-safe
// functions; calling them from a handler should be confirmed as
// intended.
void
on_signal(int sig) 
{
   pthread_mutex_lock(&_mutex);

   // Fan the shutdown request out only once.
   if(first_signal) {
      first_signal = false;

      for(size_t i = 0; i < NUMBER_OF_THREADS; i++) {
         // Skip the thread that is running this handler.
         if(pthread_self() != thread_array[i]) {
            pthread_kill(thread_array[i], SIGHUP);
        }
      }
   }

   thread_count += NUMBER_OF_THREADS;

   pthread_cond_signal(&_cond);
   //pthread_cond_broadcast(&_cond);

   pthread_mutex_unlock(&_mutex);

   // Terminates only the calling thread, not the whole process.
   pthread_exit(NULL);

   //exit(0);
}

// Signal handler that deliberately ignores the signal; main() installs
// it for SIGCHLD.
void
do_nothing(int /* sig */)
{
   // Intentionally empty.
}

using namespace std;

// Read one length-prefixed string record from f into str.
//
// Record layout (as produced by write_string()): a size_t byte count
// followed by that many bytes, the last of which is a NUL terminator.
//
// str is set to the text up to the first NUL of the payload. It is set
// to the empty string when the length prefix cannot be read or when the
// record is too large for the 8 KiB buffer. An oversize payload is
// skipped rather than left in the stream, so the next call starts at
// the following record instead of in the middle of this one.
void
read_string(fstream & f, string & str)
{
   const size_t MAX = 1024 * 8; 

   str.clear();

   size_t size = 0U;

   // No length prefix (end of file or read error): nothing to decode.
   if(!f.read(reinterpret_cast< char * >(&size), sizeof(size_t))) {
      return;
   }

   if(size >= MAX) {
      const streamoff skip = static_cast< streamoff >(size);

      if(skip < 0) {
         // The count does not fit a stream offset; treat it as corrupt.
         f.setstate(ios::failbit);
      }
      else {
         f.seekg(skip, ios::cur);
      }

      return;
   }

   char buffer[MAX + 1];

   // Zero fill guarantees termination even after a short read.
   memset(buffer, 0, sizeof(buffer));

   f.read(buffer, static_cast< streamsize >(size));

   str = string(buffer); 
}

// Write str to f as one length-prefixed record and flush the stream.
//
// The record is a size_t byte count followed by the characters of str
// and a terminating NUL; the count includes the NUL.
void
write_string(fstream & f, const string & str)
{
   const size_t payload_length = str.size() + 1;

   f.write(reinterpret_cast< const char * >(&payload_length), sizeof(size_t));

   f.write(str.c_str(), payload_length);

   f.flush();
}

// Number of bytes write_string() emits for str: the size_t length
// prefix, the characters themselves and one terminating NUL.
size_t
string_size(const string & str) 
{
   const size_t prefix_bytes = sizeof(size_t);
   const size_t terminator_bytes = 1;

   return str.size() + prefix_bytes + terminator_bytes;
}

//
// main entry point:
//
int
main(int argc,
     char ** argv,
     char ** envp)
{
   if(argc < 2) {
      cerr << "usage: " << argv[0] << " <URL>"
           << endl;

      return 1;
   }

   first_signal = true;

   main_thread = pthread_self();

   signal(SIGINT, on_signal);
   signal(SIGTERM, on_signal);
   signal(SIGHUP, on_signal);
   signal(SIGCHLD, do_nothing);

   typedef list< string > string_list_type;

   using namespace TitleBot;

   my_argc = argc;
   my_argv = argv;
   my_envp = envp;

   pthread_mutex_init(&_mutex, NULL);

   pthread_mutex_init(&_mutex2, NULL);

   pthread_mutex_init(&_connect_mutex, NULL);
  
   pthread_cond_init(&_cond, NULL);

   domain_name_finder dm(*p_the_queue);

   // for processes
   for(int i = 0; i < NUMBER_OF_THREADS; i++) {
      pthread_create(&thread_array[i], 
                     NULL, 
                     my_thread,
                     reinterpret_cast< void* >(&dm));
   }

   for(int i = 0; i < NUMBER_OF_THREADS; i++) {
      pthread_join(thread_array[i], NULL);
   }

   pthread_cond_destroy(&_cond);

   pthread_mutex_destroy(&_connect_mutex);

   pthread_mutex_destroy(&_mutex2);

   pthread_mutex_destroy(&_mutex);

   return 0;
}

