org.apache.mahout.clustering.lda.cvb
Class TopicModel
java.lang.Object
org.apache.mahout.clustering.lda.cvb.TopicModel
- All Implemented Interfaces:
- Iterable<MatrixSlice>, org.apache.hadoop.conf.Configurable
public class TopicModel
- extends Object
- implements org.apache.hadoop.conf.Configurable, Iterable<MatrixSlice>
Thin wrapper around a Matrix
of counts of occurrences of (topic, term) pairs. Dividing
topicTermCount.viewRow(topic).get(term) by the sum over the values for all terms in that
row yields p(term | topic). Dividing it instead by the sum over the values for all topics in
that term's column yields p(topic | term).
Multithreading is enabled for the update(Matrix)
method: this method is asynchronous and
merely submits the matrix to a work queue. Once all work has been submitted,
awaitTermination()
should be called; it blocks until all submitted updates have been
accumulated.
Constructor Summary |
TopicModel(org.apache.hadoop.conf.Configuration conf,
double eta,
double alpha,
String[] dictionary,
int numThreads,
double modelWeight,
org.apache.hadoop.fs.Path... modelpath)
|
TopicModel(int numTopics,
int numTerms,
double eta,
double alpha,
Random random,
String[] dictionary,
int numThreads,
double modelWeight)
|
TopicModel(int numTopics,
int numTerms,
double eta,
double alpha,
String[] dictionary,
double modelWeight)
|
TopicModel(int numTopics,
int numTerms,
double eta,
double alpha,
String[] dictionary,
int numThreads,
double modelWeight)
|
TopicModel(Matrix topicTermCounts,
double eta,
double alpha,
String[] dictionary,
int numThreads,
double modelWeight)
|
TopicModel(Matrix topicTermCounts,
Vector topicSums,
double eta,
double alpha,
String[] dictionary,
double modelWeight)
|
TopicModel(Matrix topicTermCounts,
Vector topicSums,
double eta,
double alpha,
String[] dictionary,
int numThreads,
double modelWeight)
|
Method Summary |
org.apache.hadoop.conf.Configuration |
getConf()
|
int |
getNumTerms()
|
int |
getNumTopics()
|
Vector |
infer(Vector original,
Vector docTopics)
|
Iterator<MatrixSlice> |
iterator()
|
static Pair<Matrix,Vector> |
loadModel(org.apache.hadoop.conf.Configuration conf,
org.apache.hadoop.fs.Path... modelPaths)
|
double |
perplexity(Vector document,
Vector docTopics)
\(\sum_x \sum_a (c_{ai} \cdot \log(p(x|i) \cdot p(a|x)))\) |
void |
persist(org.apache.hadoop.fs.Path outputDir,
boolean overwrite)
|
void |
renormalize()
|
void |
reset()
|
int |
sampleTerm(int topic)
|
int |
sampleTerm(Vector topicDistribution)
|
void |
setConf(org.apache.hadoop.conf.Configuration configuration)
|
void |
stop()
|
Vector |
topicSums()
|
String |
toString()
|
void |
trainDocTopicModel(Vector original,
Vector topics,
Matrix docTopicModel)
|
void |
update(int termId,
Vector topicCounts)
|
void |
update(Matrix docTopicCounts)
|
void |
updateTopic(int topic,
Vector docTopicCounts)
|
static String |
vectorToSortedString(Vector vector,
String[] dictionary)
|
TopicModel
public TopicModel(int numTopics,
int numTerms,
double eta,
double alpha,
String[] dictionary,
double modelWeight)
TopicModel
public TopicModel(org.apache.hadoop.conf.Configuration conf,
double eta,
double alpha,
String[] dictionary,
int numThreads,
double modelWeight,
org.apache.hadoop.fs.Path... modelpath)
throws IOException
- Throws:
IOException
TopicModel
public TopicModel(int numTopics,
int numTerms,
double eta,
double alpha,
String[] dictionary,
int numThreads,
double modelWeight)
TopicModel
public TopicModel(int numTopics,
int numTerms,
double eta,
double alpha,
Random random,
String[] dictionary,
int numThreads,
double modelWeight)
TopicModel
public TopicModel(Matrix topicTermCounts,
Vector topicSums,
double eta,
double alpha,
String[] dictionary,
double modelWeight)
TopicModel
public TopicModel(Matrix topicTermCounts,
double eta,
double alpha,
String[] dictionary,
int numThreads,
double modelWeight)
TopicModel
public TopicModel(Matrix topicTermCounts,
Vector topicSums,
double eta,
double alpha,
String[] dictionary,
int numThreads,
double modelWeight)
getNumTerms
public int getNumTerms()
getNumTopics
public int getNumTopics()
iterator
public Iterator<MatrixSlice> iterator()
- Specified by:
iterator
in interface Iterable<MatrixSlice>
topicSums
public Vector topicSums()
loadModel
public static Pair<Matrix,Vector> loadModel(org.apache.hadoop.conf.Configuration conf,
org.apache.hadoop.fs.Path... modelPaths)
throws IOException
- Throws:
IOException
toString
public String toString()
- Overrides:
toString
in class Object
sampleTerm
public int sampleTerm(Vector topicDistribution)
sampleTerm
public int sampleTerm(int topic)
reset
public void reset()
stop
public void stop()
renormalize
public void renormalize()
trainDocTopicModel
public void trainDocTopicModel(Vector original,
Vector topics,
Matrix docTopicModel)
infer
public Vector infer(Vector original,
Vector docTopics)
update
public void update(Matrix docTopicCounts)
updateTopic
public void updateTopic(int topic,
Vector docTopicCounts)
update
public void update(int termId,
Vector topicCounts)
persist
public void persist(org.apache.hadoop.fs.Path outputDir,
boolean overwrite)
throws IOException
- Throws:
IOException
perplexity
public double perplexity(Vector document,
Vector docTopics)
- \(\sum_x \sum_a (c_{ai} \cdot \log(p(x|i) \cdot p(a|x)))\)
vectorToSortedString
public static String vectorToSortedString(Vector vector,
String[] dictionary)
setConf
public void setConf(org.apache.hadoop.conf.Configuration configuration)
- Specified by:
setConf
in interface org.apache.hadoop.conf.Configurable
getConf
public org.apache.hadoop.conf.Configuration getConf()
- Specified by:
getConf
in interface org.apache.hadoop.conf.Configurable
Copyright © 2008–2014 The Apache Software Foundation. All rights reserved.