- Make sure that the Hadoop server is running.
# Create the HDFS directories used below, one `hadoop fs -mkdir` call per path.
for hdfs_dir in /mahout_data /kmeans_output /mahout_seq; do
  hadoop fs -mkdir "$hdfs_dir"
done
- Verify the created directories using:
# List the HDFS root. The directories were created with absolute paths under /,
# whereas `-ls` without a path lists the current user's HDFS home directory.
hadoop fs -ls /
# Upload the local file ./keyVal.txt into the /mahout_data/ directory on HDFS.
hadoop fs -put ./keyVal.txt /mahout_data/
- Note: the sample.txt file is in the hadoop/data/ directory of this repo.
# Convert the files under /mahout_data into sequence files in /mahout_seq;
# -ow overwrites any output left over from a previous run.
mahout seqdirectory -i /mahout_data -o /mahout_seq -ow
# Vectorize the sequence files: read /mahout_seq/ and write the sparse vectors
# under /mahout_sparse/, overwriting any previous output (-ow).
mahout seq2sparse \
  -i /mahout_seq/ \
  -o /mahout_sparse/ \
  -ow
# Run canopy clustering over the term-frequency vectors and write the resulting
# canopies to /canopy_output/ (-ow overwrites any previous output).
# Canopy clustering requires T1 > T2: T1 is the loose distance that decides
# which points contribute to a canopy, T2 the tight distance that decides which
# points are considered covered. -t1 must therefore be the larger value.
# NOTE(review): this step reads tf-vectors while the k-means step reads
# tfidf-vectors; confirm that mixing the two weightings is intended.
mahout canopy \
  -i /mahout_sparse/tf-vectors \
  -o /canopy_output/ \
  -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
  -t1 20 \
  -t2 10 \
  -ow
# Run k-means over the TF-IDF vectors, seeded with the canopy centres, and
# write the clusters to /kmeans_output (-ow overwrites any previous output).
# -k is deliberately not passed: when -k is given, Mahout samples k random
# input vectors as initial centroids and writes them to the -c path, which
# replaces the canopy result instead of using it. To use random seeding
# instead, pass -k <n> and point -c at a scratch directory.
# -x 2 caps the run at two iterations.
# NOTE(review): clusters-0-final is the canopy result directory name used by
# Mahout 0.7 and later; confirm with `hadoop fs -ls /canopy_output`.
mahout kmeans \
  -i /mahout_sparse/tfidf-vectors \
  -c /canopy_output/clusters-0-final \
  -o /kmeans_output \
  -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
  -x 2 \
  -ow