org.apache.mahout.common
Class HadoopUtil

java.lang.Object
  extended by org.apache.mahout.common.HadoopUtil

public final class HadoopUtil
extends Object


Method Summary
static String buildDirList(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.FileStatus fileStatus)
          Builds a comma-separated list of input splits
static String buildDirList(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.FileStatus fileStatus, org.apache.hadoop.fs.PathFilter pathFilter)
          Builds a comma-separated list of input splits
static void cacheFiles(org.apache.hadoop.fs.Path fileToCache, org.apache.hadoop.conf.Configuration conf)
           
static String calcRelativeFilePath(org.apache.hadoop.conf.Configuration configuration, org.apache.hadoop.fs.Path filePath)
           
static long countRecords(org.apache.hadoop.fs.Path path, org.apache.hadoop.conf.Configuration conf)
           
static long countRecords(org.apache.hadoop.fs.Path path, PathType pt, org.apache.hadoop.fs.PathFilter filter, org.apache.hadoop.conf.Configuration conf)
          Count all the records in a directory using a SequenceFileDirValueIterator
static void delete(org.apache.hadoop.conf.Configuration conf, Iterable<org.apache.hadoop.fs.Path> paths)
           
static void delete(org.apache.hadoop.conf.Configuration conf, org.apache.hadoop.fs.Path... paths)
           
static org.apache.hadoop.fs.Path[] getCachedFiles(org.apache.hadoop.conf.Configuration conf)
          Retrieves paths to cached files.
static String getCustomJobName(String className, org.apache.hadoop.mapreduce.JobContext job, Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper, Class<? extends org.apache.hadoop.mapreduce.Reducer> reducer)
           
static org.apache.hadoop.fs.FileStatus[] getFileStatus(org.apache.hadoop.fs.Path path, PathType pathType, org.apache.hadoop.fs.PathFilter filter, Comparator<org.apache.hadoop.fs.FileStatus> ordering, org.apache.hadoop.conf.Configuration conf)
           
static org.apache.hadoop.fs.Path getSingleCachedFile(org.apache.hadoop.conf.Configuration conf)
          Return the first cached file in the list, else null if there are no cached files.
static org.apache.hadoop.fs.FileStatus[] listStatus(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path path)
           
static org.apache.hadoop.fs.FileStatus[] listStatus(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path path, org.apache.hadoop.fs.PathFilter filter)
           
static InputStream openStream(org.apache.hadoop.fs.Path path, org.apache.hadoop.conf.Configuration conf)
           
static org.apache.hadoop.mapreduce.Job prepareJob(org.apache.hadoop.fs.Path inputPath, org.apache.hadoop.fs.Path outputPath, Class<? extends org.apache.hadoop.mapreduce.InputFormat> inputFormat, Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper, Class<? extends org.apache.hadoop.io.Writable> mapperKey, Class<? extends org.apache.hadoop.io.Writable> mapperValue, Class<? extends org.apache.hadoop.mapreduce.OutputFormat> outputFormat, org.apache.hadoop.conf.Configuration conf)
          Create a map-only Hadoop Job out of the passed-in parameters.
static org.apache.hadoop.mapreduce.Job prepareJob(org.apache.hadoop.fs.Path inputPath, org.apache.hadoop.fs.Path outputPath, Class<? extends org.apache.hadoop.mapreduce.InputFormat> inputFormat, Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper, Class<? extends org.apache.hadoop.io.Writable> mapperKey, Class<? extends org.apache.hadoop.io.Writable> mapperValue, Class<? extends org.apache.hadoop.mapreduce.Reducer> reducer, Class<? extends org.apache.hadoop.io.Writable> reducerKey, Class<? extends org.apache.hadoop.io.Writable> reducerValue, Class<? extends org.apache.hadoop.mapreduce.OutputFormat> outputFormat, org.apache.hadoop.conf.Configuration conf)
          Create a map and reduce Hadoop job.
static int readInt(org.apache.hadoop.fs.Path path, org.apache.hadoop.conf.Configuration configuration)
           
static void setSerializations(org.apache.hadoop.conf.Configuration configuration)
           
static void writeInt(int value, org.apache.hadoop.fs.Path path, org.apache.hadoop.conf.Configuration configuration)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Method Detail

prepareJob

public static org.apache.hadoop.mapreduce.Job prepareJob(org.apache.hadoop.fs.Path inputPath,
                                                         org.apache.hadoop.fs.Path outputPath,
                                                         Class<? extends org.apache.hadoop.mapreduce.InputFormat> inputFormat,
                                                         Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper,
                                                         Class<? extends org.apache.hadoop.io.Writable> mapperKey,
                                                         Class<? extends org.apache.hadoop.io.Writable> mapperValue,
                                                         Class<? extends org.apache.hadoop.mapreduce.OutputFormat> outputFormat,
                                                         org.apache.hadoop.conf.Configuration conf)
                                                  throws IOException
Create a map-only Hadoop Job out of the passed-in parameters. Does not set the Job name.

Throws:
IOException
See Also:
getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)

prepareJob

public static org.apache.hadoop.mapreduce.Job prepareJob(org.apache.hadoop.fs.Path inputPath,
                                                         org.apache.hadoop.fs.Path outputPath,
                                                         Class<? extends org.apache.hadoop.mapreduce.InputFormat> inputFormat,
                                                         Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper,
                                                         Class<? extends org.apache.hadoop.io.Writable> mapperKey,
                                                         Class<? extends org.apache.hadoop.io.Writable> mapperValue,
                                                         Class<? extends org.apache.hadoop.mapreduce.Reducer> reducer,
                                                         Class<? extends org.apache.hadoop.io.Writable> reducerKey,
                                                         Class<? extends org.apache.hadoop.io.Writable> reducerValue,
                                                         Class<? extends org.apache.hadoop.mapreduce.OutputFormat> outputFormat,
                                                         org.apache.hadoop.conf.Configuration conf)
                                                  throws IOException
Create a map and reduce Hadoop job. Does not set the name on the job.

Parameters:
inputPath - The input Path
outputPath - The output Path
inputFormat - The InputFormat
mapper - The Mapper class to use
mapperKey - The Writable key class. If the Mapper is a no-op, this value may be null
mapperValue - The Writable value class. If the Mapper is a no-op, this value may be null
reducer - The Reducer to use
reducerKey - The reducer key class.
reducerValue - The reducer value class.
outputFormat - The OutputFormat.
conf - The Configuration to use.
Returns:
The Job.
Throws:
IOException - if there is a problem with the IO.
See Also:
getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class), prepareJob(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path, Class, Class, Class, Class, Class, org.apache.hadoop.conf.Configuration)

getCustomJobName

public static String getCustomJobName(String className,
                                      org.apache.hadoop.mapreduce.JobContext job,
                                      Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper,
                                      Class<? extends org.apache.hadoop.mapreduce.Reducer> reducer)

delete

public static void delete(org.apache.hadoop.conf.Configuration conf,
                          Iterable<org.apache.hadoop.fs.Path> paths)
                   throws IOException
Throws:
IOException

delete

public static void delete(org.apache.hadoop.conf.Configuration conf,
                          org.apache.hadoop.fs.Path... paths)
                   throws IOException
Throws:
IOException

countRecords

public static long countRecords(org.apache.hadoop.fs.Path path,
                                org.apache.hadoop.conf.Configuration conf)
                         throws IOException
Throws:
IOException

countRecords

public static long countRecords(org.apache.hadoop.fs.Path path,
                                PathType pt,
                                org.apache.hadoop.fs.PathFilter filter,
                                org.apache.hadoop.conf.Configuration conf)
                         throws IOException
Count all the records in a directory using a SequenceFileDirValueIterator

Parameters:
path - The Path to count
pt - The PathType
filter - Apply the PathFilter. May be null
conf - The Hadoop Configuration
Returns:
The number of records
Throws:
IOException - if there was an IO error

openStream

public static InputStream openStream(org.apache.hadoop.fs.Path path,
                                     org.apache.hadoop.conf.Configuration conf)
                              throws IOException
Throws:
IOException

getFileStatus

public static org.apache.hadoop.fs.FileStatus[] getFileStatus(org.apache.hadoop.fs.Path path,
                                                              PathType pathType,
                                                              org.apache.hadoop.fs.PathFilter filter,
                                                              Comparator<org.apache.hadoop.fs.FileStatus> ordering,
                                                              org.apache.hadoop.conf.Configuration conf)
                                                       throws IOException
Throws:
IOException

listStatus

public static org.apache.hadoop.fs.FileStatus[] listStatus(org.apache.hadoop.fs.FileSystem fs,
                                                           org.apache.hadoop.fs.Path path)
                                                    throws IOException
Throws:
IOException

listStatus

public static org.apache.hadoop.fs.FileStatus[] listStatus(org.apache.hadoop.fs.FileSystem fs,
                                                           org.apache.hadoop.fs.Path path,
                                                           org.apache.hadoop.fs.PathFilter filter)
                                                    throws IOException
Throws:
IOException

cacheFiles

public static void cacheFiles(org.apache.hadoop.fs.Path fileToCache,
                              org.apache.hadoop.conf.Configuration conf)

getSingleCachedFile

public static org.apache.hadoop.fs.Path getSingleCachedFile(org.apache.hadoop.conf.Configuration conf)
                                                     throws IOException
Return the first cached file in the list, else null if there are no cached files.

Parameters:
conf - MapReduce Configuration
Returns:
Path of Cached file
Throws:
IOException - IO Exception

getCachedFiles

public static org.apache.hadoop.fs.Path[] getCachedFiles(org.apache.hadoop.conf.Configuration conf)
                                                  throws IOException
Retrieves paths to cached files.

Parameters:
conf - MapReduce Configuration
Returns:
Path[] of Cached Files
Throws:
IOException - IO Exception
IllegalStateException - if no cache files are found

setSerializations

public static void setSerializations(org.apache.hadoop.conf.Configuration configuration)

writeInt

public static void writeInt(int value,
                            org.apache.hadoop.fs.Path path,
                            org.apache.hadoop.conf.Configuration configuration)
                     throws IOException
Throws:
IOException

readInt

public static int readInt(org.apache.hadoop.fs.Path path,
                          org.apache.hadoop.conf.Configuration configuration)
                   throws IOException
Throws:
IOException

buildDirList

public static String buildDirList(org.apache.hadoop.fs.FileSystem fs,
                                  org.apache.hadoop.fs.FileStatus fileStatus)
                           throws IOException
Builds a comma-separated list of input splits

Parameters:
fs - File System
fileStatus - File Status
Returns:
list of directories as a comma-separated String
Throws:
IOException - IO Exception

buildDirList

public static String buildDirList(org.apache.hadoop.fs.FileSystem fs,
                                  org.apache.hadoop.fs.FileStatus fileStatus,
                                  org.apache.hadoop.fs.PathFilter pathFilter)
                           throws IOException
Builds a comma-separated list of input splits

Parameters:
fs - File System
fileStatus - File Status
pathFilter - path filter
Returns:
list of directories as a comma-separated String
Throws:
IOException - IO Exception

calcRelativeFilePath

public static String calcRelativeFilePath(org.apache.hadoop.conf.Configuration configuration,
                                          org.apache.hadoop.fs.Path filePath)
                                   throws IOException
Parameters:
configuration - configuration
filePath - Input File Path
Returns:
relative file Path
Throws:
IOException - IO Exception


Copyright © 2008–2014 The Apache Software Foundation. All rights reserved.