/*
 * The Mapper. Written against the new MapReduce API
 * (org.apache.hadoop.mapreduce.*).
 */
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/*
 * Type parameters of the Mapper:
 * LongWritable - input key (byte offset of the line within the file).
 * Text         - input value (the line itself).
 * Text         - output key (path of the file being processed).
 * Text         - output value (the matching line).
 */
public class RegexMapper extends Mapper<LongWritable, Text, Text, Text> {
    private Pattern pattern;
    private Text keyOut; // Output key: the path of the input file.

    /*
     * setup() runs once per task, before the first call to map().
     * Everything map() will need is prepared here.
     */
    @Override
    public void setup(Context context) throws IOException {
        /*
         * Compile the regular expression once; it is passed in through
         * the job configuration ("regex" property) set by the Driver class.
         */
        pattern = Pattern.compile(context.getConfiguration().get("regex"));

        /* Determine the path of the file the current split (valueIn) comes from. */
        Path filePath = ((FileSplit) context.getInputSplit()).getPath();
        keyOut = new Text(filePath.toString());
    }

    /*
     * map() is called for every input record. It emits a pair consisting of
     * the output key (keyOut - the file path prepared in setup()) and the
     * output value (valueIn - the current line of the file).
     */
    @Override
    public void map(LongWritable key, Text valueIn, Context context)
            throws IOException, InterruptedException {
        Matcher matcher = pattern.matcher(valueIn.toString());

        /*
         * If the line matches the regular expression,
         * emit it together with the file path.
         */
        if (matcher.find())
            context.write(keyOut, valueIn); // Emit the matching line.
    }
}
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/* All four type parameters of the Reducer are Text. */
public class RegexReducer extends Reducer<Text, Text, Text, Text> {

    /* Collects all matching lines of a file under its path. */
    @Override
    public void reduce(Text keyIn, Iterable<Text> valuesIn, Context context)
            throws IOException, InterruptedException {
        /* Concatenate the matching lines with a StringBuilder. */
        StringBuilder valueOut = new StringBuilder();

        for (Text value : valuesIn)
            valueOut.append("\n").append(value.toString());

        valueOut.append("\n");
        context.write(keyIn, new Text(valueOut.toString()));
    }
}
import com.petrez.mappers.RegexMapper;
import com.petrez.reducers.RegexReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class Grep {
    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        if (args.length != 3) {
            System.out.println("Usage: <inDir> <outDir> <regex>");
            ToolRunner.printGenericCommandUsage(System.out);
            System.exit(-1);
        }

        Configuration config = new Configuration();

        /* Pass the regex to map() through the job configuration. */
        config.set("regex", args[2]);

        Job job = new Job(config, "grep");

        /*
         * Tell Hadoop which jar to ship to the cluster by naming
         * a class contained in it.
         */
        job.setJarByClass(Grep.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        /*
         * Set the input and output formats. TextInputFormat feeds map()
         * one line of the input file at a time, splitting the file on "\n".
         */
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(RegexMapper.class);
        job.setReducerClass(RegexReducer.class);

        job.waitForCompletion(true);
    }
}
The packaged job is launched like any other Hadoop jar, with the same three arguments expected by the driver:

<hadoop home>/bin/hadoop jar /home/hduser/HadoopGrep.jar <input dir> <output dir> <regex>
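For example, assuming Hadoop is installed under /usr/local/hadoop and the input files have already been copied to HDFS (the HDFS paths and the pattern below are placeholders, not values from the original setup), the invocation could look like this:

/usr/local/hadoop/bin/hadoop jar /home/hduser/HadoopGrep.jar /user/hduser/grep-in /user/hduser/grep-out "ERROR.*"

Note that the output directory must not exist before the run, otherwise FileOutputFormat will abort the job. Once the job finishes, the reducer output can be inspected with hadoop fs -cat /user/hduser/grep-out/part-r-00000.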
Source: https://habr.com/ru/post/189798/