org.apache.mahout.text
Class SequenceFilesFromMailArchives

java.lang.Object
  extended by org.apache.hadoop.conf.Configured
      extended by org.apache.mahout.common.AbstractJob
          extended by org.apache.mahout.text.SequenceFilesFromMailArchives
All Implemented Interfaces:
org.apache.hadoop.conf.Configurable, org.apache.hadoop.util.Tool

public final class SequenceFilesFromMailArchives
extends AbstractJob

Converts a directory of gzipped mail archives into SequenceFiles of the specified chunkSize. This class is similar to SequenceFilesFromDirectory, except that it uses block-compressed SequenceFiles and parses out the subject and body text of each mail message into its own key/value pair.


Field Summary
static String BASE_INPUT_PATH
           
static String[] BODY_OPTION
           
static String[] BODY_SEPARATOR_OPTION
           
static String[] CHARSET_OPTION
           
static String[] CHUNK_SIZE_OPTION
           
static String[] FROM_OPTION
           
static String[] KEY_PREFIX_OPTION
           
static String[] QUOTED_REGEX_OPTION
           
static String[] REFERENCES_OPTION
           
static String[] SEPARATOR_OPTION
           
static String[] STRIP_QUOTED_OPTION
           
static String[] SUBJECT_OPTION
           
static String[] TO_OPTION
           
 
Fields inherited from class org.apache.mahout.common.AbstractJob
argMap, inputFile, inputPath, outputFile, outputPath, tempPath
 
Constructor Summary
SequenceFilesFromMailArchives()
           
 
Method Summary
 void createSequenceFiles(MailOptions options)
           
static void main(String[] args)
           
 int run(String[] args)
           
 
Methods inherited from class org.apache.mahout.common.AbstractJob
addFlag, addInputOption, addOption, addOption, addOption, addOption, addOutputOption, buildOption, buildOption, getAnalyzerClassFromOption, getCLIOption, getConf, getDimensions, getFloat, getFloat, getGroup, getInputFile, getInputPath, getInt, getInt, getOption, getOption, getOption, getOptions, getOutputFile, getOutputPath, getOutputPath, getTempPath, getTempPath, hasOption, keyFor, maybePut, parseArguments, parseArguments, parseDirectories, prepareJob, prepareJob, prepareJob, prepareJob, setConf, setS3SafeCombinedInputPath, shouldRunNextPhase
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

CHUNK_SIZE_OPTION

public static final String[] CHUNK_SIZE_OPTION

KEY_PREFIX_OPTION

public static final String[] KEY_PREFIX_OPTION

CHARSET_OPTION

public static final String[] CHARSET_OPTION

SUBJECT_OPTION

public static final String[] SUBJECT_OPTION

TO_OPTION

public static final String[] TO_OPTION

FROM_OPTION

public static final String[] FROM_OPTION

REFERENCES_OPTION

public static final String[] REFERENCES_OPTION

BODY_OPTION

public static final String[] BODY_OPTION

STRIP_QUOTED_OPTION

public static final String[] STRIP_QUOTED_OPTION

QUOTED_REGEX_OPTION

public static final String[] QUOTED_REGEX_OPTION

SEPARATOR_OPTION

public static final String[] SEPARATOR_OPTION

BODY_SEPARATOR_OPTION

public static final String[] BODY_SEPARATOR_OPTION

BASE_INPUT_PATH

public static final String BASE_INPUT_PATH
See Also:
Constant Field Values
Constructor Detail

SequenceFilesFromMailArchives

public SequenceFilesFromMailArchives()
Method Detail

createSequenceFiles

public void createSequenceFiles(MailOptions options)
                         throws IOException
Throws:
IOException

main

public static void main(String[] args)
                 throws Exception
Throws:
Exception

run

public int run(String[] args)
        throws Exception
Throws:
Exception


Copyright © 2008–2014 The Apache Software Foundation. All rights reserved.