package com.mwc.wsearch;

import java.util.*;
import java.io.*;
import java.net.*;
import java.sql.*;

import gnu.regexp.*;

import com.mwc.util.*;
import com.mwc.spider.*;

/**
 * A web spider for a web search engine.
 * 
 * @author Matthew W. Coan
 */
public class Indexer extends SimpleWebSpider {
   /**
    * The size of the per-page word hashtable.
    * 
    * This was chosen based upon test data (Apache Web Server Manual):
    * 
    * mysql> select sum(frequency) from word_frequencies;
    * +----------------+
    * | sum(frequency) |
    * +----------------+
    * |         247505 |
    * +----------------+
    * 1 row in set (0.12 sec)
    *
    * mysql> select count(id) from documents;
    * +-----------+
    * | count(id) |
    * +-----------+
    * |       194 |
    * +-----------+
    * 1 row in set (0.00 sec)
    * 
    * The average plus 20% = (247505 / 194) + ((247505 / 194) * 0.2) = 1530.96
    */
   public static final int WORD_TABLE_SIZE = 1531;
   
   // Cache of robots.txt rules: key is the external form of the robots.txt
   // URL, value is a Vector of disallowed path prefixes (Strings).
   private TreeMap _disallowMap;
   // _allow: sorted list of host names that may be crawled; it is only
   // assigned by the two-argument constructor, and null means "any host".
   // _allowContentTypes: sorted copy of the accepted content types; null
   // means "accept every content type".
   private List _allow, _allowContentTypes;
   // Guards _lookingForURL and is the monitor worker threads wait on while
   // no URL is available.
   private Object _lookingForURLLock;
   // Number of threads currently inside selectNextURL().
   private int _lookingForURL;
   // Guards _getDataTaskCount; waitForEnd() waits on this monitor.
   private Object _getDataTaskCountLock;
   // Number of GetDataTask threads that have entered run() and not yet left it.
   private int _getDataTaskCount;
   // JDBC connection shared by all worker threads.
   private Connection _connection;
   // Held by worker threads while they run statements on _connection.
   private Object _connectionLock;
   // URL string (fragment removed) -> Integer status: 0 = registered but not
   // yet handed to a worker, 1 = handed to a worker.
   private Dictionary _urlTable;
   // Configuration read through the "indexer.*" property keys.
   private Properties _props;
   // Words to skip; keys are lower-cased host + "___" + word.
   private Dictionary _badWordTable;
   // True when the "indexer.debug" property is "true"; enables _debug output.
   private boolean _debugMode;
   // Serializes debug output so lines from different threads do not interleave.
   private Object _debugLock;
   
   /**
    * Writes a debug message to standard output, preceded by a line holding
    * the current thread's name in brackets. Does nothing unless debug mode
    * is enabled.
    *
    * @param msg the text to print
    */
   private void _debug(String msg) {
      if(!_debugMode)
         return;
      String header = "[" + Thread.currentThread().getName() + "]";
      synchronized(_debugLock) {
         System.out.println(header);
         System.out.println(msg);
         System.out.flush();
      }
   }
   
   /**
    * Writes the stack trace of a throwable to standard output, preceded by
    * a line holding the current thread's name in brackets. Does nothing
    * unless debug mode is enabled.
    *
    * @param th the throwable whose stack trace is printed
    */
   private void _debug(Throwable th) {
      if(!_debugMode)
         return;
      String header = "[" + Thread.currentThread().getName() + "]";
      synchronized(_debugLock) {
         PrintWriter writer = new PrintWriter(new OutputStreamWriter(System.out));
         writer.println(header);
         th.printStackTrace(writer);
         writer.flush();
      }
   }
   
   /**
    * Looks up a configuration property.
    *
    * @param p the property key
    * @return the configured value, or null when the key is not set
    */
   public String getProp(String p) {
      String value = _props.getProperty(p);
      return value;
   }
   
   /**
    * Constructs a web spider: stores the configuration, opens the database
    * connection and pre-loads every URL already present in the
    * <code>documents</code> table so that it is visited again.
    *
    * @param userAgent the user agent string passed to the superclass
    * @param allowContentTypes the content types to index, or null to accept
    *        every content type; the list is copied and sorted
    * @param props the configuration; the keys read here are indexer.debug,
    *        indexer.db.url, indexer.db.user and indexer.db.pass
    * @throws Throwable if the database cannot be reached or queried
    */
   public Indexer(String userAgent, 
                  List allowContentTypes,
                  Properties props)
   throws Throwable {
      super(userAgent);
      _props = props;
      _disallowMap = new TreeMap();
      // Boolean.valueOf(null) is FALSE, so a missing property disables debugging.
      _debugMode = Boolean.valueOf(_props.getProperty("indexer.debug")).booleanValue();
      _debugLock = new Object();
      
      if(allowContentTypes == null)
         _allowContentTypes = null;
      else {
         // Random-access copy: allow() runs a binary search on this list.
         _allowContentTypes = new ArrayList(allowContentTypes);
         Collections.sort(_allowContentTypes);
      }
      
      _lookingForURLLock = new Object();
      _lookingForURL = 0;
      _getDataTaskCountLock = new Object();
      _getDataTaskCount = 0;
      _connectionLock = new Object();
      // Start with an empty bad-word table so worker threads never see null
      // when this constructor is used directly.
      _badWordTable = new MWCHashtable(512);
      _connection = DriverManager.getConnection(_props.getProperty("indexer.db.url"), 
                                                _props.getProperty("indexer.db.user"), 
                                                _props.getProperty("indexer.db.pass"));
      _urlTable = new MWCHashtable(3571);
      Statement s = _connection.createStatement();
      try {
         ResultSet rs = s.executeQuery("SELECT url FROM documents");
         try {
            // Status 0 marks the URL as registered but not yet fetched.
            Integer zero = new Integer(0);
            while(rs.next())
               _urlTable.put(rs.getString("url"), zero);
         }
         finally {
            rs.close();
         }
      }
      finally {
         s.close();
      }
   }
   
   /**
    * Tests a string against a simple glob pattern.
    * <p>
    * The pattern is a list of alternatives separated by '|'; the string
    * matches when it matches any one alternative in full. Inside an
    * alternative, '?' matches exactly one character and '*' matches any
    * run of characters, including none.
    *
    * @param pattern the glob pattern, possibly with '|' separated alternatives
    * @param string the text to test
    * @return true when at least one alternative matches the whole string
    */
   private boolean _match( String pattern, String string ) {
      int start = 0;
      while(true) {
         int bar = pattern.indexOf('|', start);
         int end = (bar == -1) ? pattern.length() : bar;
         if(_matchAlternative(pattern, start, end, string, 0))
            return true;
         if(bar == -1)
            return false;
         start = bar + 1;
      }
   }

   /**
    * Matches string[s..] against the single alternative pattern[p..pEnd).
    * Never looks past pEnd, so a '*' cannot leak into a later alternative.
    */
   private boolean _matchAlternative(String pattern, int p, int pEnd,
                                     String string, int s) {
      for(; p < pEnd; ++p, ++s) {
         char c = pattern.charAt(p);
         if(c == '*') {
            // Try the longest run first; i == s lets '*' match nothing,
            // which also covers the case where the string is exhausted.
            for(int i = string.length(); i >= s; --i)
               if(_matchAlternative(pattern, p + 1, pEnd, string, i))
                  return true;
            return false;
         }
         if(s >= string.length())
            return false;
         if(c != '?' && c != string.charAt(s))
            return false;
      }
      // Pattern exhausted: it is a match only if the string is exhausted too.
      return s >= string.length();
   }
   
   /**
    * Returns the path prefixes that robots.txt on the given host disallows
    * for this spider's user agent, fetching and caching them on first use.
    * <p>
    * The rule set is added to the cache only after it has been read
    * completely, so other threads never see a half-filled list. Two threads
    * asking for the same uncached host at once may both fetch the file.
    *
    * @param host the host name
    * @param port the port, or -1 for the protocol default
    * @return the disallowed prefixes (possibly empty), or null when no
    *         robots.txt URL can be built for the host
    */
   private Vector _disallowedPatternVector(String host, int port) {
      URL url;
      try {
         url = new URL("http", host, port, "/robots.txt");
      }
      catch ( MalformedURLException ignore ) {
         _debug(ignore);
         return null;
      }

      String key = url.toExternalForm();
      Vector disallows;
      synchronized(_disallowMap) {
         disallows = (Vector) _disallowMap.get(key);
      }
      if(disallows != null)
         return disallows;

      disallows = new Vector();
      try {
         _readRobotsTxt(url, disallows);
      }
      catch (IOException ignore) {
         // Best effort: an unreadable robots.txt leaves whatever was parsed.
         _debug(ignore);
      }
      synchronized(_disallowMap) {
         _disallowMap.put(key, disallows);
      }
      return disallows;
   }

   /**
    * Downloads robots.txt and appends every non-empty Disallow value that
    * applies to this spider's user agent to the given vector.
    */
   private void _readRobotsTxt(URL url, Vector disallows)
   throws IOException {
      _debug("["+Thread.currentThread().getName()+"]\n"
             +"GET_ROBOTS_TEXT("+url.toExternalForm()+")");
      HttpURLConnection connection = (HttpURLConnection)url.openConnection();
      if(connection.getResponseCode() != HttpURLConnection.HTTP_OK)
         return;
      String contentType = connection.getContentType();
      if(contentType == null)
         return;
      // Drop parameters such as "; charset=UTF-8" before comparing.
      int semi = contentType.indexOf(';');
      if(semi >= 0)
         contentType = contentType.substring(0, semi);
      if(!contentType.trim().equalsIgnoreCase("text/plain"))
         return;

      BufferedReader robotReader = new BufferedReader(
                                   new InputStreamReader(connection.getInputStream()));
      try {
         boolean userAgentIsMe = false;
         String line;
         while((line = robotReader.readLine()) != null) {
            line = line.trim();

            // Pure comment lines do not end a record.
            if(line.startsWith("#"))
               continue;

            int cmt = line.indexOf('#');
            if(cmt != -1)
               line = line.substring(0, cmt).trim();

            if(line.length() == 0) {
               // A blank line ends the current record.
               userAgentIsMe = false;
            }
            else if(line.regionMatches(true, 0, "user-agent:", 0, 11)) {
               if(!userAgentIsMe && _match(line.substring(11).trim(), userAgent))
                  userAgentIsMe = true;
            }
            else if(line.regionMatches(true, 0, "disallow:", 0, 9)) {
               if(userAgentIsMe) {
                  String value = line.substring(9).trim();
                  // An empty Disallow means "everything is allowed"; storing
                  // it would make every path match as a prefix.
                  if(value.length() > 0) {
                     _debug("DISALLOW("+value+")");
                     disallows.addElement(value);
                  }
               }
            }
         }
      }
      finally {
         robotReader.close();
      }
   }
   
   /**
    * Registers a URL with the spider so that a worker thread will fetch it.
    * <p>
    * The URL is ignored when its host is not in the allowed host list (if
    * there is one) or when its file part starts with one of the prefixes
    * that robots.txt disallows for the host.
    *
    * @param url the URL to register
    * @throws Throwable declared for subclasses and callers; nothing is
    *         thrown deliberately here
    */
   public void registerURL(URL url)
   throws Throwable {
      if((_allow != null) && (Collections.binarySearch(_allow, url.getHost()) < 0))
         return;
      Vector disallowed = _disallowedPatternVector(url.getHost(), url.getPort());
      if(disallowed != null) {
         String file = url.getFile();
         for(int i = 0; i < disallowed.size(); i++) {
            String pattern = (String)disallowed.get(i);
            // An empty Disallow value means "allow everything"; as a prefix
            // it would match, and therefore block, every URL.
            if(pattern.length() == 0)
               continue;
            if(file.startsWith(pattern))
               return;
         }
      }
      // Called for its side effect: visited() records the URL when it is new.
      visited(url);
   }
   
   
   /**
    * Starts the worker threads.
    * <p>
    * The threads belong to one thread group whose maximum priority comes
    * from the optional "indexer.thread.priority" property (MAX_PRIORITY,
    * MIN_PRIORITY or NORM_PRIORITY, case-insensitive). Any other value, or
    * no value, leaves the group's default.
    *
    * @param n the number of worker threads to start
    */
   public void start(int n) {
      ThreadGroup group = new ThreadGroup("Indexer Threads");
      String priorityString = _props.getProperty("indexer.thread.priority");
      if(priorityString != null) {
         if(priorityString.equalsIgnoreCase("MAX_PRIORITY")) 
            group.setMaxPriority(Thread.MAX_PRIORITY);
         else if(priorityString.equalsIgnoreCase("MIN_PRIORITY"))
            group.setMaxPriority(Thread.MIN_PRIORITY);
         else if(priorityString.equalsIgnoreCase("NORM_PRIORITY"))
            group.setMaxPriority(Thread.NORM_PRIORITY);
      }
      
      for(int i = 0; i < n; i++)
         new Thread(group, new GetDataTask(), 
                    "Indexer Thread " + i).start();
   }
      
   /**
    * A worker task that indexes web pages.
    * <p>
    * Each task repeatedly asks the spider for the next URL, fetches and
    * parses the page, and stores the document row and its word frequencies
    * in the database. All database work on the shared connection is done
    * while holding the connection lock. A failure on one URL is reported
    * through the debug log and does not stop the task.
    */
   class GetDataTask implements Runnable {
      // Word -> Integer frequency for the page currently being indexed.
      private Dictionary _wordTable;
      // URL of the page currently being indexed.
      private URL _currentHost;
      // Summary text for the current page, supplied through setBlurb();
      // reset to null before each page.
      private StringBuffer _blurb = null;
      
      public Dictionary wordTable() { return _wordTable; }
      public URL currentHost() { return _currentHost; }
      
      public void setBlurb(StringBuffer blurb) { _blurb = blurb; }
      
      /**
       * Doubles every single quote in the given text.
       * <p>
       * Kept for callers; the statements issued by this class bind their
       * values as parameters instead of relying on this method.
       *
       * @param s the text to escape
       * @return the text with each ' replaced by ''
       */
      public String encode(String s) {
         StringBuffer sb = new StringBuffer(s.length() + 8);
         for(int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if(c == '\'')
               sb.append("''");
            else
               sb.append(c);
         }
         return sb.toString();
      }
      
      /**
       * Applies the configured thread priority, then indexes URLs until the
       * spider has no more to hand out.
       */
      public void run() {
         String priorityString = _props.getProperty("indexer.thread.priority");
         if(priorityString != null) {
            if(priorityString.equalsIgnoreCase("MAX_PRIORITY")) 
               Thread.currentThread().setPriority(Thread.MAX_PRIORITY);
            else if(priorityString.equalsIgnoreCase("MIN_PRIORITY"))
               Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
            else if(priorityString.equalsIgnoreCase("NORM_PRIORITY"))
               Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
         }
         synchronized(_getDataTaskCountLock) {
            _getDataTaskCount++;
         }
         try {
            Statement s = _connection.createStatement();
            try {
               while(true) {
                  URL url = selectNextURL();
                  if(url == null)
                     break;
                  try {
                     _indexURL(url, s);
                  }
                  catch(Throwable th) {
                     // One bad page must not end the worker.
                     _debug(th);
                  }
               }
            }
            finally {
               s.close();
            }
         }
         catch(SQLException sqle) {
            _debug(sqle);
         }
         finally {
            synchronized(_getDataTaskCountLock) {
               _getDataTaskCount--;
               _getDataTaskCountLock.notify();
            }
            synchronized(_lookingForURLLock) {
               _lookingForURLLock.notify();
            }
         }
      }
      
      /**
       * Fetches one URL and, unless the stored copy is still current,
       * parses it and writes the result to the database.
       */
      private void _indexURL(URL url, Statement s)
      throws IOException, SQLException {
         HttpURLConnection connection = (HttpURLConnection)url.openConnection();
         _currentHost = connection.getURL();
         // Do not carry the previous page's blurb over to this one.
         _blurb = null;
         String location = url.toExternalForm();
         
         int docID = -1;
         long lastMod = -1;
         synchronized(_connectionLock) {
            // Documents are stored under their full external form, so the
            // lookup has to use the same value.
            PreparedStatement find = _connection.prepareStatement(
               "SELECT id, last_mod FROM documents WHERE url = ?");
            try {
               find.setString(1, location);
               ResultSet rs = find.executeQuery();
               try {
                  if(rs.next()) {
                     docID = rs.getInt("id");
                     lastMod = rs.getLong("last_mod");
                  }
               }
               finally {
                  rs.close();
               }
            }
            finally {
               find.close();
            }
         }
         
         // getLastModified() returns 0 when the server sends no date; such
         // pages are always re-indexed because their age is unknown.
         long modified = connection.getLastModified();
         if(docID >= 0 && modified != 0 && modified <= lastMod)
            return;
         
         _wordTable = new MWCHashtable(WORD_TABLE_SIZE);
         if(!processData(connection, _wordTable, this))
            return;
         
         String blurb = (_blurb == null) ? "" : _blurb.toString();
         int length = connection.getContentLength();
         synchronized(_connectionLock) {
            if(docID < 0)
               docID = _insertDocument(s, location, modified, length, blurb);
            else
               _replaceDocument(docID, location, modified, length, blurb);
            _storeWordFrequencies(s, docID);
         }
      }
      
      /** Inserts a new document row and returns its generated id. */
      private int _insertDocument(Statement s, String location, long modified,
                                  int length, String blurb)
      throws SQLException {
         PreparedStatement insert = _connection.prepareStatement(
            "INSERT INTO documents VALUES (null, ?, ?, ?, ?)");
         try {
            insert.setString(1, location);
            insert.setLong(2, modified);
            insert.setInt(3, length);
            insert.setString(4, blurb);
            insert.executeUpdate();
         }
         finally {
            insert.close();
         }
         return _lastInsertID(s, "DocumentID");
      }
      
      /**
       * Updates an existing document row and removes its old word
       * frequencies. The delete happens only here, once the page has been
       * parsed again, so an unchanged page keeps its index entries.
       */
      private void _replaceDocument(int docID, String location, long modified,
                                    int length, String blurb)
      throws SQLException {
         PreparedStatement purge = _connection.prepareStatement(
            "DELETE FROM word_frequencies WHERE doc_id = ?");
         try {
            purge.setInt(1, docID);
            purge.executeUpdate();
         }
         finally {
            purge.close();
         }
         PreparedStatement update = _connection.prepareStatement(
            "UPDATE documents SET url = ?, last_mod = ?, length = ?, blurb = ?"
            + " WHERE id = ?");
         try {
            update.setString(1, location);
            update.setLong(2, modified);
            update.setInt(3, length);
            update.setString(4, blurb);
            update.setInt(5, docID);
            update.executeUpdate();
         }
         finally {
            update.close();
         }
      }
      
      /**
       * Writes one word_frequencies row per word of the current page,
       * creating rows in the words table for words not seen before. Words
       * listed in the bad-word table for the current host are skipped.
       */
      private void _storeWordFrequencies(Statement s, int docID)
      throws SQLException {
         String hostPrefix = _currentHost.getHost().toLowerCase() + "___";
         PreparedStatement findWord = null;
         PreparedStatement addWord = null;
         PreparedStatement addFrequency = null;
         try {
            findWord = _connection.prepareStatement(
               "SELECT id FROM words WHERE word = ?");
            addWord = _connection.prepareStatement(
               "INSERT INTO words VALUES (null, ?, SOUNDEX(?))");
            addFrequency = _connection.prepareStatement(
               "INSERT INTO word_frequencies VALUES (?, ?, ?)");
            Enumeration words = _wordTable.keys();
            while(words.hasMoreElements()) {
               String word = (String)words.nextElement();
               if(_badWordTable != null 
                  && _badWordTable.get(hostPrefix + word) != null)
                  continue;
               Integer frequency = (Integer)_wordTable.get(word);
               
               int wordID = -1;
               boolean known = false;
               findWord.setString(1, word);
               ResultSet rs = findWord.executeQuery();
               try {
                  if(rs.next()) {
                     known = true;
                     wordID = rs.getInt("id");
                  }
               }
               finally {
                  rs.close();
               }
               if(!known) {
                  addWord.setString(1, word);
                  addWord.setString(2, word);
                  addWord.executeUpdate();
                  wordID = _lastInsertID(s, "WordID");
               }
               
               addFrequency.setInt(1, wordID);
               addFrequency.setInt(2, docID);
               addFrequency.setInt(3, frequency.intValue());
               addFrequency.executeUpdate();
            }
         }
         finally {
            _closeQuietly(findWord);
            _closeQuietly(addWord);
            _closeQuietly(addFrequency);
         }
      }
      
      /** Returns the id generated by the last insert on the shared connection. */
      private int _lastInsertID(Statement s, String alias)
      throws SQLException {
         ResultSet rs = s.executeQuery("SELECT LAST_INSERT_ID() As " + alias);
         try {
            rs.next();
            return rs.getInt(alias);
         }
         finally {
            rs.close();
         }
      }
      
      /** Closes a statement, reporting but not propagating any failure. */
      private void _closeQuietly(Statement st) {
         if(st == null)
            return;
         try {
            st.close();
         }
         catch(SQLException sqle) {
            _debug(sqle);
         }
      }
   }

   /**
    * Blocks until every worker thread has finished.
    * <p>
    * The method first sleeps five seconds so the workers can start, then
    * waits until the running-task counter drops to zero. An interrupt does
    * not cut the wait short, but the interrupt status is restored before
    * returning so the caller can still see it.
    */
   public void waitForEnd() {
      boolean interrupted = false;
      
      // sleep to give it a chance to get started
      try {
         Thread.sleep(1000 * 5);
      }
      catch(InterruptedException ie) {
         interrupted = true;
      }
      
      synchronized(_getDataTaskCountLock) { 
         while(_getDataTaskCount > 0) {
            try {
               _getDataTaskCountLock.wait();
            }
            catch(InterruptedException ie) {
               // Keep waiting; re-asserting the flag here would make the
               // next wait() throw immediately.
               interrupted = true;
            }
         }
      }
      
      if(interrupted)
         Thread.currentThread().interrupt();
   }

   /**
    * Tells whether the content type of a response may be indexed.
    * <p>
    * Parameters after a ';' (for example a charset) and surrounding white
    * space are ignored. When no list of content types was configured,
    * every response is accepted.
    *
    * @param connection the connection whose content type is checked
    * @return true when the content type is accepted; false when it is not
    *         in the list or the response has no content type
    */
   public boolean allow(HttpURLConnection connection) {
      if(_allowContentTypes == null) 
         return true;
      String ct = connection.getContentType();
      if(ct == null)
         return false;
      int i = ct.indexOf(';');
      if(i >= 0)
         ct = ct.substring(0,i);
      ct = ct.trim();
      return Collections.binarySearch(_allowContentTypes, ct) >= 0;
   }

   /**
    * Constructs a spider restricted to the sites listed in the properties.
    * <p>
    * The property "indexer.sites" gives the number of sites; for each site
    * i (starting at 1), "indexer.site<i>.url" is the start URL and
    * "indexer.site<i>.badWordFile" names a CSV file whose first column
    * lists words to leave out of the index for that host. A site without
    * a badWordFile property simply has no bad words.
    *
    * @param allowContentTypes the content types to index, or null for all
    * @param props the configuration
    * @throws Throwable if the configuration is invalid, the database cannot
    *         be reached, or a bad-word file cannot be read
    */
   public Indexer(List allowContentTypes,
                  Properties props)
   throws Throwable {
      this("com.mwc.wsearch.Indexer (V1.0.0)", allowContentTypes, props);
      int n = Integer.parseInt(_props.getProperty("indexer.sites"));
      _allow = new ArrayList();
      _badWordTable = new MWCHashtable(512);
      for(int i = 1; i <= n; i++) {
         URL url = new URL(_props.getProperty("indexer.site"+i+".url"));
         _allow.add(url.getHost());
         // registerURL() binary-searches _allow, so the list must be
         // sorted before every call, not only after the loop.
         Collections.sort(_allow);
         registerURL(url);
         
         String badWordFile = _props.getProperty("indexer.site"+i+".badWordFile");
         if(badWordFile == null)
            continue;
         String hostPrefix = url.getHost().toLowerCase() + "___";
         CSVFileReader csv = new CSVFileReader(badWordFile, false);
         csv.open();
         try {
            while(csv.readLine())
               _badWordTable.put(hostPrefix + csv.getValue(0).toLowerCase(), "");
         }
         finally {
            csv.close();
         }
      }
      Collections.sort(_allow);
   }
   
   /**
    * Counts one occurrence of a word: stores 1 for a word not yet in the
    * table, otherwise stores the previous count plus one.
    *
    * @param doc the document the word came from (not used here)
    * @param word the word to count
    * @param wordTable the table mapping words to Integer counts
    */
   public void registerWord(URL doc,
                            String word, 
                            Dictionary wordTable) {
      Integer previous = (Integer)wordTable.get(word);
      int count = (previous == null) ? 1 : 1 + previous.intValue();
      wordTable.put(word, new Integer(count));
   }
   
   /**
    * Parses the page behind a connection.
    * <p>
    * The page is skipped when the response code is not 200, when it has no
    * content type, or when its content type is not allowed. Otherwise the
    * body is handed to the page parser, and the response stream is closed
    * afterwards whether or not parsing succeeded.
    *
    * @param connection the connection to read the page from
    * @param wordTable the word table of the calling task (not used
    *        directly here)
    * @param task the task on whose behalf the page is parsed
    * @return true when the page was parsed; false when it was skipped or
    *         could not be parsed
    * @throws IOException if the response body cannot be opened or read
    */
   protected boolean processData(HttpURLConnection connection, 
                                 Dictionary wordTable, 
                                 GetDataTask task)
   throws IOException {
      try {
         if((connection.getResponseCode() != HttpURLConnection.HTTP_OK)
            || (connection.getContentType() == null)
            || !allow(connection)) {
            _debug("CONTENT-TYPE="+connection.getContentType());
            _debug("RESPONSE_CODE="+connection.getResponseCode());
            _debug("badURL(\""+ connection.getURL().toExternalForm() + "\")");
            return false;
         }
      }
      catch(Throwable th) {
         _debug(th);
         _debug("badURL(\""+ connection.getURL().toExternalForm() + "\")");
         return false;
      }
      _debug("["+Thread.currentThread().getName()+"]\n"
             +"PROCESS_DATA("+connection.getURL().toExternalForm()+")***");
      InputStream in = connection.getInputStream();
      try {
         PageParser.parsePage(in, connection, this, task);
      }
      catch(ParseException pe) {
         _debug("["+Thread.currentThread().getName()+"]");
         _debug(pe);
         return false;
      }
      catch(REException re) {
         _debug("["+Thread.currentThread().getName()+"]");
         _debug(re);
         return false;
      }
      finally {
         // Closing the response stream releases the underlying connection.
         try {
            in.close();
         }
         catch(IOException ignored) {
            _debug(ignored);
         }
      }
      
      return true;
   }
   
   /**
    * Tells whether a URL is already known to the spider and, when it is
    * not, records it with status 0. The fragment part ('#' and everything
    * after it) is ignored.
    *
    * @param url the URL to check
    * @return true when the URL was already recorded; false when this call
    *         recorded it
    */
   protected boolean visited(URL url) {
      String key = url.toExternalForm();
      int fragment = key.indexOf('#');
      if(fragment >= 0)
         key = key.substring(0, fragment);
      synchronized(_urlTable) {
         Integer status = (Integer)_urlTable.get(key);
         if(status != null)
            return true;
         _urlTable.put(key, new Integer(0));
         return false;
      }
   }
   
   /**
    * Hands out the next URL that has not yet been given to a worker.
    * <p>
    * The method scans the URL table for an entry with status 0, marks it
    * with status 1 and returns it. When no such entry exists, the calling
    * thread waits on the looking-for-URL lock and scans again once it is
    * notified. When the number of threads inside this method equals the
    * number of running tasks, nobody is left to add URLs and the method
    * returns null.
    * <p>
    * Any Throwable raised here (for example from the URL constructor or
    * from wait()) is written to the debug log and also results in null.
    *
    * @return the next URL to fetch, or null when the caller should stop
    */
   public URL selectNextURL() {
      String u;
      int taskCount;
      Enumeration elements, keys;
      Integer status;
      // Announce that this thread is now looking for work and wake one
      // waiter so it can re-check the termination condition.
      synchronized(_lookingForURLLock) {
         _lookingForURL++;
         _lookingForURLLock.notify();
      }
      try {
         while(true) {
            synchronized(_urlTable) {
               // Walks values and keys side by side.
               // NOTE(review): this assumes both enumerations run in the
               // same order - confirm for MWCHashtable.
               elements = _urlTable.elements();
               keys = _urlTable.keys();
               while(elements.hasMoreElements() 
                     && keys.hasMoreElements()) {
                  status = (Integer)elements.nextElement();
                  u = (String)keys.nextElement();
                  if(status.intValue() == 0) {
                     _debug("SET-STATUS("+u+")");
                     // Status 1: the URL has been handed to a worker.
                     _urlTable.put(u, new Integer(1));
                     return new URL(u);
                  }
               }
            }

            synchronized(_getDataTaskCountLock) { 
               taskCount = _getDataTaskCount;
            }
            synchronized(_lookingForURLLock) {
               _debug("_lookingForURL == " + _lookingForURL + " taskCount = " + taskCount);
               // Every running task is idle in this method: crawl finished.
               if(_lookingForURL == taskCount) {
                  break;
               }
               else {
                  // NOTE(review): within this file the only notify() calls
                  // on this lock are at the top of this method and at the
                  // end of GetDataTask.run().
                  _lookingForURLLock.wait();
               }
            }
         }
      }
      catch(Throwable th) {
         _debug(th);
      }
      finally {
         // Runs on every exit path, including the successful return above.
         synchronized(_lookingForURLLock) {
            _lookingForURL--;
         }
      }
      return null;
   }
   
   /**
    * Command line entry point: loads the properties file named by the only
    * argument, loads the JDBC driver named by "indexer.db.driver", runs
    * the spider with "indexer.threads" worker threads over text/html and
    * text/plain pages, and reports the total time through the debug log.
    *
    * @param args the path of the properties file
    * @throws Throwable if the configuration cannot be read or the spider
    *         cannot be created
    */
   public static void main(String args[]) 
   throws Throwable {
      if(args.length != 1) {
         System.err.println("Usage: com.mwc.wsearch.Indexer <properties file name>");
         System.exit(1);
      }
      Properties props = new Properties();
      InputStream in = new BufferedInputStream(new FileInputStream(args[0]));
      try {
         props.load(in);
      }
      finally {
         // Closing the buffered stream closes the file as well.
         in.close();
      }
      Class.forName(props.getProperty("indexer.db.driver")).newInstance();
      long start = System.currentTimeMillis();
      List allowContentTypes = new LinkedList();
      allowContentTypes.add("text/html");
      allowContentTypes.add("text/plain");
      Indexer spider = new Indexer(allowContentTypes, props);
      spider.start(Integer.parseInt(props.getProperty("indexer.threads")));
      spider.waitForEnd();
      long end = System.currentTimeMillis();
      long total = end - start;
      spider._debug("TOTAL_INDEXING_TIME="+((double)total / 1000.0)+"sec.");
   }   
}