MapReduce的倒排索引
索引:
什么是索引:索引(Index)是帮助数据库高效获取数据的数据结构。索引基于数据库表创建,它包含表中某些列的值以及对应记录的地址,并把这些值存储在一个数据结构中。最常见的索引结构是哈希表和 B+ 树。
索引的具体分析:
用代码说事,先来看看我的数据吧:
包com.huhu.day05;import java.io.IOException;导入org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.FileSplit;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.GenericOptionsParser;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import com.huhu.day04.ProgenyCount;公共类InvertedIndex扩展ToolRunner实现工具{ 私人配置conf; 公共静态类MyMapper扩展Mapper{ 私人FileSplit拆分; private Text va = new Text(); @覆盖 保护无效设置(Mapper .Context上下文) 抛出IOException,InterruptedException { split =(FileSplit)context.getInputSplit(); } @覆盖 protected void map(LongWritable key,Text value,Context context)throws IOException,InterruptedException { String [] line = value.toString()。split(“”); 通信System.err.println(线); String filename = split.getPath()。getName(); for(String s:line){ va.set(“fileName:”+ filename +“:”+ key.get()+“\ t索引位置:”+ value.toString()。indexOf(s)+“\ t”); context.write(new Text(“搜索词:”+ s +“\ r”),new Text(va)); } } } 公共静态类MyReduce扩展Reducer <文本,文本,文本,文本> { @覆盖 保护无效设置(上下文上下文)抛出IOException,InterruptedException { } @覆盖 protected void reduce(Text key,Iterable values,Context context) 抛出IOException,InterruptedException { StringBuffer sb = new StringBuffer(); for(Text v:values){ sb.append(v.toString()); } context.write(new Text(key),new Text(sb.toString())); } @覆盖 保护无效清理(上下文上下文)抛出IOException,InterruptedException { } } 公共静态无效的主要(字符串[]参数)抛出异常{ InvertedIndex t = new InvertedIndex(); 配置conf = t.getConf(); String [] other = new GenericOptionsParser(conf,args).getRemainingArgs(); if(other.length!= 2){ System.err.println(“number is fail”); } int run = ToolRunner.run(conf,t,args); System.exit(运行); } 
@覆盖 public Configuration getConf(){ if(conf!= null){ 返回conf; } 返回新的配置(); } @覆盖 public void setConf(Configuration arg0){ } @覆盖 公共诠释运行(字符串[]其他)抛出异常{ 配置con = getConf(); Job job = Job.getInstance(con); job.setJarByClass(ProgenyCount.class); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); //默认分区 // job.setPartitionerClass(HashPartitioner.class); job.setReducerClass(MyReduce.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job,new Path(“hdfs:// ry-hadoop1:8020 / in / day05 / InvertedIndex”)); Path path = new Path(“hdfs:// ry-hadoop1:8020 / out / day05.txt”); FileSystem fs = FileSystem.get(getConf()); if(fs.exists(path)){ fs.delete(path,true); } FileOutputFormat.setOutputPath(job,path); 返回job.waitForCompletion(true)?0:1; }} 文本,文本,文本,文本>
索引很重要:
详情: