Hadoop-Mapreduce实战(辅助排序和二次排序案例（GroupingComparator）)

辅助排序和二次排序案例（GroupingComparator）

需求

有如下订单

订单id	商品id	成交金额
0000001	Pdt_01	222.8
0000001	Pdt_06	25.8
0000002	Pdt_03	522.8
0000002	Pdt_04	122.4
0000002	Pdt_05	722.4
0000003	Pdt_01	222.8
0000003	Pdt_02	33.8

现在需要求出每一个订单中最贵的商品。
输入数据

输出数据预期
分析
- 利用“订单id和成交金额”作为key，可以将map阶段读取到的所有订单数据按照id分区，按照金额排序，发送到reduce。
- 在reduce端利用groupingcomparator将订单id相同的kv聚合成组，然后取第一个即是最大值。
  4）代码实现
定义订单信息OrderBean

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

@Getter
@Setter
@AllArgsConstructor
@NoArgsConstructor
public class OrderBean implements WritableComparable<OrderBean> {
   
    // 订单id号
    private int order_id;
    // 价格
    private double price;

    @Override
    public String toString() {
   
        return order_id + "\t" + price;
    }

    @Override
    public int compareTo(OrderBean o) {
   
        int result;

        if (this.order_id > o.getOrder_id()) {
   
            result = 1;
        } else if (this.order_id < o.getOrder_id()) {
   
            result = -1;
        } else {
   
            result = this.price > o.getPrice() ? -1 : 1;
        }
        return result;
    }

    @Override
    public void write(DataOutput out) throws IOException {
   
        out.writeInt(order_id);
        out.writeDouble(price);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
   
        in.readInt();
        in.readDouble();
    }
}

编写OrderSortMapper

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
   
	OrderBean k = new OrderBean();
	
	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
   
		
		// 1 获取一行
		String line = value.toString();
		
		// 2 截取
		String[] fields = line.split("\t");
		
		// 3 封装对象
		k.setOrder_id(Integer.parseInt(fields[0]));
		k.setPrice(Double.parseDouble(fields[2]));
		
		// 4 写出
		context.write(k, NullWritable.get());
	}
}

编写OrderSortPartitioner

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class OrderPartitioner extends Partitioner<OrderBean, NullWritable> {
   

	@Override
	public int getPartition(OrderBean key, NullWritable value, int numReduceTasks) {
   
		
		return (key.getOrder_id() & Integer.MAX_VALUE) % numReduceTasks;
	}
}

编写OrderSortGroupingComparator

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class OrderGroupingComparator extends WritableComparator {
   

	protected OrderGroupingComparator() {
   
		super(OrderBean.class, true);
	}

	@SuppressWarnings("rawtypes")
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
   

		OrderBean aBean = (OrderBean) a;
		OrderBean bBean = (OrderBean) b;

		int result;
		if (aBean.getOrder_id() > bBean.getOrder_id()) {
   
			result = 1;
		} else if (aBean.getOrder_id() < bBean.getOrder_id()) {
   
			result = -1;
		} else {
   
			result = 0;
		}

		return result;
	}
}

编写OrderSortReducer

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class OrderReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
   

	@Override
	protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context)
			throws IOException, InterruptedException {
   
		
		context.write(key, NullWritable.get());
	}
}

编写OrderSortDriver

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OrderDriver {
   

	public static void main(String[] args) throws Exception {
   

		// 1 获取配置信息
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		// 2 设置jar包加载路径
		job.setJarByClass(OrderDriver.class);

		// 3 加载map/reduce类
		job.setMapperClass(OrderMapper.class);
		job.setReducerClass(OrderReducer.class);

		// 4 设置map输出数据key和value类型
		job.setMapOutputKeyClass(OrderBean.class);
		job.setMapOutputValueClass(NullWritable.class);

		// 5 设置最终输出数据的key和value类型
		job.setOutputKeyClass(OrderBean.class);
		job.setOutputValueClass(NullWritable.class);

		// 6 设置输入数据和输出数据路径
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// 10 设置reduce端的分组
		job.setGroupingComparatorClass(OrderGroupingComparator.class);

		// 7 设置分区
		job.setPartitionerClass(OrderPartitioner.class);

		// 8 设置reduce个数
		job.setNumReduceTasks(3);

		// 9 提交
		boolean result = job.waitForCompletion(true);
		System.exit(result ? 0 : 1);
	}
}

小文件处理（自定义InputFormat）

需求

无论hdfs还是mapreduce，对于小文件都有损效率，实践中，又难免面临处理大量小文件的场景，此时，就需要有相应解决方案。将多个小文件合并成一个文件SequenceFile，SequenceFile里面存储着多个文件，存储的形式为文件路径+名称为key，文件内容为value。
输入数据

最终预期文件格式：
分析

小文件的优化无非以下几种方式：
- 在数据采集的时候，就将小文件或小批数据合成大文件再上传HDFS
- 在业务处理之前，在HDFS上使用mapreduce程序对小文件进行合并
- 在mapreduce处理时，可采用CombineTextInputFormat提高效率
具体实现

本节采用自定义InputFormat的方式，处理输入小文件的问题。
- 自定义一个类继承FileInputFormat
- 改写RecordReader，实现一次读取一个完整文件封装为KV
- 在输出时使用SequenceFileOutPutFormat输出合并文件
程序实现
- 自定义InputFromat

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// 定义类继承FileInputFormat
public class WholeFileInputformat extends FileInputFormat<NullWritable, BytesWritable>{
   
	
	@Override
	protected boolean isSplitable(JobContext context, Path filename) {
   
		return false;
	}

	@Override
	public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
			throws IOException, InterruptedException {
   
		
		WholeRecordReader recordReader = new WholeRecordReader();
		recordReader.initialize(split, context);
		
		return recordReader;
	}
}

自定义RecordReader

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeRecordReader extends RecordReader<NullWritable, BytesWritable>{
   

	private Configuration configuration;
	private FileSplit split;
	
	private boolean processed = false;
	private BytesWritable value = new BytesWritable();
	
	@Override
	public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
   
		
		this.split = (FileSplit)split;
		configuration = context.getConfiguration();
	}

	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
   
		
		if (!processed) {
   
			// 1 定义缓存区
			byte[] contents = new byte[(int)split.getLength()];
			
			FileSystem fs = null;
			FSDataInputStream fis = null;
			
			try {
   
				// 2 获取文件系统
				Path path = split.getPath();
				fs = path.getFileSystem(configuration);
				
				// 3 读取数据
				fis = fs.open(path);
				
				// 4 读取文件内容
				IOUtils.readFully(fis, contents, 0, contents.length);
				
				// 5 输出文件内容
				value.set(contents, 0, contents.length);
			} catch (Exception e) {
   
				
			}finally {
   
				IOUtils.closeStream(fis);
			}
			
			processed = true;
			
			return true;
		}
		
		return false;
	}

	@Override
	public NullWritable getCurrentKey() throws IOException, InterruptedException {
   
		return NullWritable.get();
	}

	@Override
	public BytesWritable getCurrentValue() throws IOException, InterruptedException {
   
		return value;
	}

	@Override
	public float getProgress() throws IOException, InterruptedException {
   
		return processed? 1:0;
	}

	@Override
	public void close() throws IOException {
   
	}
}

SequenceFileMapper处理流程

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SequenceFileReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {
   

	@Override
	protected void reduce(Text key, Iterable<BytesWritable> values, Context context)
			throws IOException, InterruptedException {
   

		context.write(key, values.iterator().next());
	}
}

SequenceFileDriver处理流程

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SequenceFileDriver {
   

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
   
		
		args = new String[] {
    "e:/input/inputinputformat", "e:/output1" };
		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);
		job.setJarByClass(SequenceFileDriver.class);
		job.setMapperClass(SequenceFileMapper.class);
		job.setReducerClass(SequenceFileReducer.class);

        // 设置输入的inputFormat
		job.setInputFormatClass(WholeFileInputformat.class);
        // 设置输出的outputFormat
		job.setOutputFormatClass(SequenceFileOutputFormat.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(BytesWritable.class);
		
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(BytesWritable.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		boolean result = job.waitForCompletion(true);

		System.exit(result ? 0 : 1);
	}
}