Commit a06ec81b authored by cs451's avatar cs451
Browse files

added CountWordSimple and updated pom.xml to use https. Readme updated to...

added WordCountSimple and updated pom.xml to use https. Readme updated to include the new example and new data directory
parent aba9ffa3
......@@ -3,4 +3,3 @@
*.iml
*~
target/
data/
......@@ -14,15 +14,9 @@ Build the package:
$ mvn clean package
```
Grab the data:
Dataset:
```
$ mkdir data
$ curl http://lintool.github.io/bespin-data/Shakespeare.txt > data/Shakespeare.txt
$ curl http://lintool.github.io/bespin-data/p2p-Gnutella08-adj.txt > data/p2p-Gnutella08-adj.txt
```
The datasets are stored in the [Bespin data repo](https://github.com/lintool/bespin-data).
The datasets are stored in the data directory.
+ The file `Shakespeare.txt` contains the [The Complete Works of William Shakespeare](http://www.gutenberg.org/ebooks/100) from [Project Gutenberg](http://www.gutenberg.org/).
+ The file `p2p-Gnutella08-adj.txt` contains a [snapshot of the Gnutella peer-to-peer file sharing network from August 2002](http://snap.stanford.edu/data/p2p-Gnutella08.html), where nodes represent hosts in the Gnutella network topology and edges represent connections between the Gnutella hosts. This dataset is available from the [Stanford Network Analysis Project](http://snap.stanford.edu/).
......@@ -30,7 +24,16 @@ The datasets are stored in the [Bespin data repo](https://github.com/lintool/bes
## Word Count in MapReduce and Spark
Make sure you've downloaded the Shakespeare collection (see "Getting Started" above). Running word count in Java MapReduce:
Running word count in Java MapReduce:
The "WordCountSimple" example is a simple MapReduce program that counts the number of times each word occurs in the input text.
```
$ hadoop jar target/bespin-1.0.5-SNAPSHOT-fatjar.jar io.bespin.java.mapreduce.wordcount.WordCountSimple \
-input data/Shakespeare.txt -output wc-simple
```
The next example adds some optimizations such as regular combiners and in-mapper (in-memory) combining:
```
$ hadoop jar target/bespin-1.0.5-SNAPSHOT-fatjar.jar io.bespin.java.mapreduce.wordcount.WordCount \
......
This diff is collapsed.
This diff is collapsed.
......@@ -18,9 +18,9 @@
</license>
</licenses>
<scm>
<connection>scm:git:git@github.com:lintool/bespin.git</connection>
<developerConnection>scm:git:git@github.com:lintool/bespin.git</developerConnection>
<url>git@github.com:lintool/bespin.git</url>
<connection>scm:git:ist-git@git.uwaterloo.ca:cs451/bespin.git</connection>
<developerConnection>scm:git:ist-git@git.uwaterloo.ca:cs451/bespin.git</developerConnection>
<url>ist-git@git.uwaterloo.ca:cs451/bespin.git</url>
</scm>
<developers>
<developer>
......@@ -28,6 +28,11 @@
<name>Jimmy Lin</name>
<email>jimmylin@uwaterloo.ca</email>
</developer>
<developer>
<id>cs451</id>
<name>Ali Abedi</name>
<email>ali.abedi@uwaterloo.ca</email>
</developer>
</developers>
<distributionManagement>
......@@ -40,7 +45,7 @@
<repositories>
<repository>
<id>maven</id>
<url>http://repo.maven.apache.org/maven2/</url>
<url>https://repo.maven.apache.org/maven2/</url>
</repository>
</repositories>
......
/**
* Bespin: reference implementations of "big data" algorithms
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.bespin.java.mapreduce.wordcount;
import io.bespin.java.util.Tokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.ParserProperties;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
/**
* Simple word count demo with no optimization. See WordCount.java for combiner optimization.
*/
/**
 * Simple word-count demo with no combiner optimization: the mapper emits
 * {@code (token, 1)} for every word occurrence and the reducer sums the counts.
 * See {@code WordCount.java} for the combiner-optimized variant.
 *
 * <p>Usage: {@code -input [path] -output [path] [-reducers [num]]}.
 * The output directory is deleted first if it already exists.
 */
public class WordCountSimple extends Configured implements Tool {
  private static final Logger LOG = Logger.getLogger(WordCountSimple.class);

  // Mapper: emits (token, 1) for every word occurrence.
  public static final class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reuse objects to save overhead of object creation.
    private static final IntWritable ONE = new IntWritable(1);
    private static final Text WORD = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      for (String word : Tokenizer.tokenize(value.toString())) {
        WORD.set(word);
        context.write(WORD, ONE);
      }
    }
  }

  // Reducer: sums up all the counts for each token.
  public static final class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Reuse objects.
    private static final IntWritable SUM = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      // Sum up values.
      Iterator<IntWritable> iter = values.iterator();
      int sum = 0;
      while (iter.hasNext()) {
        sum += iter.next().get();
      }
      SUM.set(sum);
      context.write(key, SUM);
    }
  }

  /**
   * Creates an instance of this tool.
   */
  private WordCountSimple() {}

  // Command-line arguments, parsed by args4j.
  private static final class Args {
    @Option(name = "-input", metaVar = "[path]", required = true, usage = "input path")
    String input;

    @Option(name = "-output", metaVar = "[path]", required = true, usage = "output path")
    String output;

    @Option(name = "-reducers", metaVar = "[num]", usage = "number of reducers")
    int numReducers = 1;
  }

  /**
   * Runs this tool.
   *
   * @param argv command-line arguments
   * @return 0 on success, non-zero on argument error or job failure
   * @throws Exception if the job throws
   */
  @Override
  public int run(String[] argv) throws Exception {
    final Args args = new Args();
    CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100));

    try {
      parser.parseArgument(argv);
    } catch (CmdLineException e) {
      System.err.println(e.getMessage());
      parser.printUsage(System.err);
      return -1;
    }

    LOG.info("Tool: " + WordCountSimple.class.getSimpleName());
    LOG.info(" - input path: " + args.input);
    LOG.info(" - output path: " + args.output);
    LOG.info(" - number of reducers: " + args.numReducers);

    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJobName(WordCountSimple.class.getSimpleName());
    job.setJarByClass(WordCountSimple.class);

    job.setNumReduceTasks(args.numReducers);

    FileInputFormat.setInputPaths(job, new Path(args.input));
    FileOutputFormat.setOutputPath(job, new Path(args.output));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(args.output);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    // Propagate job success/failure to the caller: the original always returned 0,
    // which made a failed job look like a success to ToolRunner and shell scripts.
    boolean succeeded = job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return succeeded ? 0 : 1;
  }

  /**
   * Dispatches command-line arguments to the tool via the {@code ToolRunner}.
   *
   * @param args command-line arguments
   * @throws Exception if tool encounters an exception
   */
  public static void main(String[] args) throws Exception {
    // Exit with the tool's return code so callers (scripts, schedulers) can
    // detect failure; the original discarded it.
    System.exit(ToolRunner.run(new WordCountSimple(), args));
  }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment