import java.io.*;
import java.lang.*;
import java.sql.*;
import java.util.*;

/**
 *  Gets the frequency of each word for each movie.  This data will be used for clustering users.
 *
 * @author     Brandon Douthit-Wood
 * @created    March 31, 2004
 */
public class GetMovieWordFrequency {

	// hash tables to store words and frequencies
	private Hashtable wordHash, stopList, globalHash;

	/**Constructor for the GetMovieWordFrequency object */
	public GetMovieWordFrequency() {
		stopList = new Hashtable();
		globalHash = new Hashtable();
		wordHash = new Hashtable();

		if ( !Query.connectToDB() ) {
			System.exit( 0 );
		}
	}

	/**  Calculates frequencies for all words. */
	public void calculateWordFrequency() {
		String id;
		String words;
		String word;
		String query;
		String title;
		String newWords;
		Hashtable hash;
		Integer numWords;
		Integer globalNumWords;
		StringTokenizer token;
		BufferedReader input;
		Enumeration enum;

		// delimiters to remove from words
		String delims = " \t\n\r\f`~!@#$%^&*()_-+={[}]|\\<,>.?/:;\"'";

		while ( true ) {
			// continually loop through movies until all have been processed
			query = "select * from movie where word_freq='' limit 1000";
			ResultSet userResult = Query.executeQuery( query );
			if ( Query.getNumResults( userResult ) == 0 ) {
				return;
			}

			try {
				while ( userResult.next() ) {
					id = userResult.getString( "id" );
					words = userResult.getString( "words" );

					hash = new Hashtable();
					newWords = "";

					token = new StringTokenizer( words, delims );
					while ( token.hasMoreTokens() ) {
						word = token.nextToken();
						word = word.toLowerCase();

						// throw out any words in stopList
						if ( stopList.containsKey( word ) ) {
							continue;
						}
						// get rid of other junk
						if ( word.equals( "ii" ) || word.equals( "iii" ) || word.equals( "iv" )
							 || word.equals( "jr" ) || word.equals( "sr" ) || word.equals( "x" ) ) {
							continue;
						}

						// lookup the current word in hashtable
						numWords = (Integer) hash.get( word );
						// not in hashtable, add it
						if ( numWords == null ) {
							numWords = new Integer( 1 );
							hash.put( word, numWords );
						}
						// already in hashtable, increment frequency count
						else {
							numWords = new Integer( numWords.intValue() + 1 );
							hash.put( word, numWords );
						}
					}

					// calculate frequencies
					int size = hash.size();
					int num;
					enum = hash.keys();
					while ( enum.hasMoreElements() ) {
						word = (String) enum.nextElement();
						num = ( (Integer) hash.get( word ) ).intValue();

						double freq = (double) num / size;
						newWords += word + ":" + freq + ":";
					}

					// if word occurs less than 3 times, we will ignore it
					if ( newWords.length() < 3 ) {
						newWords = "x";
					}
					else {
						newWords = newWords.substring( 0, newWords.length() - 1 );
					}

					// update movie table
					query = "update movie set word_freq='" + newWords + "' where id=" + id;
					Query.executeUpdate( query );
					System.out.println( id );
				}
				userResult.close();
			}
			catch ( SQLException e ) {
				System.err.println( "Error parsing words..." );
				e.printStackTrace();
				System.exit( 0 );
			}
		}
	}

	/**
	 *  Reads the list of words from the stop list into a hash table - these are common words that
	 *  should be ignored since they do not provide much contextual information.
	 */
	public void readStopList() {
		String filename = "stoplist.txt";
		String word;
		BufferedReader input;

		try {
			// open stoplist file for reading
			input = new BufferedReader( new FileReader( filename ) );
			word = input.readLine();

			// add words to stoplist hashtable
			while ( word != null ) {
				stopList.put( word, word );
				word = input.readLine();
			}
			input.close();
		}
		catch ( FileNotFoundException e ) {
			System.err.println( "Could not find stoplist file: " + filename );
			e.printStackTrace();
			System.exit( 0 );
		}
		catch ( IOException e ) {
			System.err.println( "Error reading stoplist file: " + filename );
			e.printStackTrace();
			System.exit( 0 );
		}
	}

	/**
	 *  Gets the word frequencies for the movies
	 *
	 * @param  args  The command line arguments
	 */
	public static void main( String[] args ) {
		GetMovieWordFrequency words = new GetMovieWordFrequency();
		words.readStopList();
		System.out.println( "getting word frequencies..." );
		words.calculateWordFrequency();
	}
}

