/* Generated By:JavaCC: Do not edit this line. PageParser.java */
package com.mwc.wsearch;

import java.util.*;
import java.io.*;
import java.net.*;
import java.sql.*;

import gnu.regexp.*;

/**
 * Flexable web page parser for the indexer.
 *
 * @author Matthew W. Coan (6:15 PM 12/4/2002)
 */
public class PageParser implements PageParserConstants {
   private int _inCSS;
   private int _inJS;
   private RE _urlPattern;
   private HttpURLConnection _connection;
   private Indexer _ws;
   private Indexer.GetDataTask _gdTask;
   private StringBuffer _pageBlurb;
   private int _blurbWordCount;

   public static void parsePage(InputStream in,
                                HttpURLConnection connection,
                                Indexer ws,
                                Indexer.GetDataTask gd)
   throws ParseException, REException {
      PageParser parser = new PageParser(in);
      parser._gdTask = gd;
      parser._inCSS = parser._inJS = 0;
      parser._connection = connection;
      parser._ws = ws;
      parser._urlPattern = new RE("(((([hH][rR][eE][fF])|([sS][rR][cC]))\\=)"
                                  +"(\\\"|\\\')?"
                                  +"([a-zA-Z0-9\\_\\-\\.\\:\\;\\?\\&\\=\\%\\#\\$\\@\\+\\/\\~])+"
                                  +"(\\\"|\\\')?)");
      parser._pageBlurb = new StringBuffer();
      parser._blurbWordCount = 0;
      parser.OnePage();
   }

  final public void OnePage() throws ParseException {
  Token x; String word; Integer count;
    label_1:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case WS:
      case STAGO:
      case ETAGO:
      case TAGC:
      case WORD:
      case CHAR_ENT:
      case STYLE_TAG:
      case SCRIPT_TAG:
      case ANY_TAG:
      case DOCTYPE_TAG:
      case PROC_DIR:
      case END_SCRIPT_TAG:
      case END_STYLE_TAG:
      case ANY_END_TAG:
      case OTHER_DATA:
      case 16:
        ;
        break;
      default:
        jj_la1[0] = jj_gen;
        break label_1;
      }
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case WS:
        x = jj_consume_token(WS);
           if((_pageBlurb.length() == 0) || (!Character.isSpaceChar(_pageBlurb.charAt(_pageBlurb.length()-1))))
              _pageBlurb.append(x.image);
        break;
      case STYLE_TAG:
        x = jj_consume_token(STYLE_TAG);
           _inCSS++;
        break;
      case DOCTYPE_TAG:
        x = jj_consume_token(DOCTYPE_TAG);

        break;
      case PROC_DIR:
        x = jj_consume_token(PROC_DIR);

        break;
      case END_STYLE_TAG:
        x = jj_consume_token(END_STYLE_TAG);
           if(_inCSS > 0)
              _inCSS--;
        break;
      case SCRIPT_TAG:
        x = jj_consume_token(SCRIPT_TAG);
           _inJS++;
        break;
      case END_SCRIPT_TAG:
        x = jj_consume_token(END_SCRIPT_TAG);
           if(_inJS > 0)
              _inJS--;
        break;
      case ANY_TAG:
        x = jj_consume_token(ANY_TAG);
           REMatch matchArray[] = _urlPattern.getAllMatches(x.image);
           String url;
           int index;
           for(int i = 0; i < matchArray.length; i++) {
              url = matchArray[i].toString();
              index = url.indexOf('=');
              if((index < 0) || ((index+1) >= url.length()))
                 continue;
              url = url.substring(index+1);
              if(url.length() == 0)
                 continue;
              if(url.charAt(0) == '\"' || url.charAt(0) == '\'')
                 url = url.substring(1);
              if(url.length() == 0)
                 continue;
              if(url.charAt(url.length()-1) == '\"' || url.charAt(url.length()-1) == '\'')
                 url = url.substring(0, url.length()-1);
              try {
                 if(_ws != null && url.length() > 5 && url.substring(0,5).equalsIgnoreCase("http:"))
                    _ws.registerURL(new URL(url));
                 else if(_ws != null && url.indexOf(':') < 0)
                    _ws.registerURL(new URL(_connection.getURL(), url));
                 //System.out.println("URL("+url+")");
                 //System.out.flush();
              }
              catch(Throwable th) {
                 th.printStackTrace();
              }
           }

           if((_pageBlurb.length() == 0) || (!Character.isSpaceChar(_pageBlurb.charAt(_pageBlurb.length()-1))))
              _pageBlurb.append(" ");
        break;
      case ANY_END_TAG:
        x = jj_consume_token(ANY_END_TAG);
           //System.out.println("END_TAG("+x.image+")"); 
           if((_pageBlurb.length() == 0) || (!Character.isSpaceChar(_pageBlurb.charAt(_pageBlurb.length()-1))))
              _pageBlurb.append(" ");
        break;
      case WORD:
        x = jj_consume_token(WORD);
          //System.out.println("WORD0("+x.image+")"); 
           if(_inCSS == 0 && _inJS == 0) {
              _pageBlurb.append(x.image);
              word = x.image.toLowerCase();
              _ws.registerWord(_gdTask.currentHost(), word, _gdTask.wordTable());
              //System.out.println("WORD1("+x.image+")"); 
           }
        break;
      case CHAR_ENT:
        x = jj_consume_token(CHAR_ENT);
           //System.out.println("CHAR_ENT("+x.image+")"); 
           if((_pageBlurb.length() == 0) || (!Character.isSpaceChar(_pageBlurb.charAt(_pageBlurb.length()-1))))
              _pageBlurb.append(' ');
        break;
      case OTHER_DATA:
        x = jj_consume_token(OTHER_DATA);
           //System.out.println("OTHER_DATA("+x.image+")");
           if(_inCSS == 0 && _inJS == 0) {
              _pageBlurb.append(x.image);
           }
        break;
      case STAGO:
        jj_consume_token(STAGO);

        break;
      case TAGC:
        jj_consume_token(TAGC);

        break;
      case ETAGO:
        jj_consume_token(ETAGO);

        break;
      case 16:
        jj_consume_token(16);

        break;
      default:
        jj_la1[1] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
      }
    }
    jj_consume_token(0);
      _gdTask.setBlurb(_pageBlurb);
  }

  public PageParserTokenManager token_source;
  SimpleCharStream jj_input_stream;
  public Token token, jj_nt;
  private int jj_ntk;
  private int jj_gen;
  final private int[] jj_la1 = new int[2];
  final private int[] jj_la1_0 = {0x1fffe,0x1fffe,};

  public PageParser(java.io.InputStream stream) {
    jj_input_stream = new SimpleCharStream(stream, 1, 1);
    token_source = new PageParserTokenManager(jj_input_stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 2; i++) jj_la1[i] = -1;
  }

  public void ReInit(java.io.InputStream stream) {
    jj_input_stream.ReInit(stream, 1, 1);
    token_source.ReInit(jj_input_stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 2; i++) jj_la1[i] = -1;
  }

  public PageParser(java.io.Reader stream) {
    jj_input_stream = new SimpleCharStream(stream, 1, 1);
    token_source = new PageParserTokenManager(jj_input_stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 2; i++) jj_la1[i] = -1;
  }

  public void ReInit(java.io.Reader stream) {
    jj_input_stream.ReInit(stream, 1, 1);
    token_source.ReInit(jj_input_stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 2; i++) jj_la1[i] = -1;
  }

  public PageParser(PageParserTokenManager tm) {
    token_source = tm;
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 2; i++) jj_la1[i] = -1;
  }

  public void ReInit(PageParserTokenManager tm) {
    token_source = tm;
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 2; i++) jj_la1[i] = -1;
  }

  final private Token jj_consume_token(int kind) throws ParseException {
    Token oldToken;
    if ((oldToken = token).next != null) token = token.next;
    else token = token.next = token_source.getNextToken();
    jj_ntk = -1;
    if (token.kind == kind) {
      jj_gen++;
      return token;
    }
    token = oldToken;
    jj_kind = kind;
    throw generateParseException();
  }

  final public Token getNextToken() {
    if (token.next != null) token = token.next;
    else token = token.next = token_source.getNextToken();
    jj_ntk = -1;
    jj_gen++;
    return token;
  }

  final public Token getToken(int index) {
    Token t = token;
    for (int i = 0; i < index; i++) {
      if (t.next != null) t = t.next;
      else t = t.next = token_source.getNextToken();
    }
    return t;
  }

  final private int jj_ntk() {
    if ((jj_nt=token.next) == null)
      return (jj_ntk = (token.next=token_source.getNextToken()).kind);
    else
      return (jj_ntk = jj_nt.kind);
  }

  private java.util.Vector jj_expentries = new java.util.Vector();
  private int[] jj_expentry;
  private int jj_kind = -1;

  final public ParseException generateParseException() {
    jj_expentries.removeAllElements();
    boolean[] la1tokens = new boolean[17];
    for (int i = 0; i < 17; i++) {
      la1tokens[i] = false;
    }
    if (jj_kind >= 0) {
      la1tokens[jj_kind] = true;
      jj_kind = -1;
    }
    for (int i = 0; i < 2; i++) {
      if (jj_la1[i] == jj_gen) {
        for (int j = 0; j < 32; j++) {
          if ((jj_la1_0[i] & (1<<j)) != 0) {
            la1tokens[j] = true;
          }
        }
      }
    }
    for (int i = 0; i < 17; i++) {
      if (la1tokens[i]) {
        jj_expentry = new int[1];
        jj_expentry[0] = i;
        jj_expentries.addElement(jj_expentry);
      }
    }
    int[][] exptokseq = new int[jj_expentries.size()][];
    for (int i = 0; i < jj_expentries.size(); i++) {
      exptokseq[i] = (int[])jj_expentries.elementAt(i);
    }
    return new ParseException(token, exptokseq, tokenImage);
  }

  final public void enable_tracing() {
  }

  final public void disable_tracing() {
  }

}
