
Reading and Writing SequenceFile Using Hadoop 2.0 APIs

Tags:

hadoop

I am looking for an example that uses the new API to read and write sequence files.

Effectively, I need to know how to use this function:

 createWriter(Configuration conf, org.apache.hadoop.io.SequenceFile.Writer.Option... opts) 

The old definition is not working for me:

SequenceFile.createWriter( fs, conf, path, key.getClass(), value.getClass());

Similarly, I need to know the code for reading a sequence file, as the following is deprecated:

SequenceFile.Reader(fs, path, conf);

Here is the way to use the new API:

    String uri = args[0];
    Configuration conf = new Configuration();
    Path path = new Path(uri);

    IntWritable key = new IntWritable();
    Text value = new Text();

    CompressionCodec codec = new GzipCodec();
    SequenceFile.Writer writer = null;
    Option optPath = SequenceFile.Writer.file(path);
    Option optKey = SequenceFile.Writer.keyClass(key.getClass());
    Option optVal = SequenceFile.Writer.valueClass(value.getClass());
    Option optCom = SequenceFile.Writer.compression(CompressionType.RECORD, codec);

    writer = SequenceFile.createWriter(conf, optPath, optKey, optVal, optCom);
asked Apr 17 '13 by user923227

People also ask

What is SequenceFile Hadoop?

A SequenceFile is a flat, binary file type that serves as a container for data to be used in Apache Hadoop distributed computing projects. SequenceFiles are used extensively with MapReduce.

How many formats of SequenceFile are present in hadoop io?

SequenceFile has three available formats: an "Uncompressed" format, a "Record-Compressed" format, and a "Block-Compressed" format.

Which code snippet is used to read sequence files Hadoop?

Reading a SequenceFile with the command-line interface: Hadoop provides the command hadoop fs -text to display the contents of a sequence file in text format.

How do I read a sequence file?

To read a SequenceFile using the Java API in Hadoop, create an instance of SequenceFile.Reader. Using that reader instance, you can iterate over the (key, value) pairs in the SequenceFile with the next() method.
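For example, the command-line approach mentioned above would look like this (the HDFS path below is a hypothetical example, and the command requires a running Hadoop installation):

```
# Print the (key, value) records of a sequence file as text
hadoop fs -text /user/hadoop/data/part-00000.seq
```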


2 Answers

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;
import org.junit.Test;

public class SequenceFilesTest {
  @Test
  public void testSeqFileReadWrite() throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path seqFilePath = new Path("file.seq");
    SequenceFile.Writer writer = SequenceFile.createWriter(conf,
            Writer.file(seqFilePath), Writer.keyClass(Text.class),
            Writer.valueClass(IntWritable.class));

    writer.append(new Text("key1"), new IntWritable(1));
    writer.append(new Text("key2"), new IntWritable(2));

    writer.close();

    SequenceFile.Reader reader = new SequenceFile.Reader(conf,
            Reader.file(seqFilePath));

    Text key = new Text();
    IntWritable val = new IntWritable();

    while (reader.next(key, val)) {
        System.err.println(key + "\t" + val);
    }

    reader.close();
  }
}
answered Oct 11 '22 by Chris White


I'm more than a year late to answer, but I just got started with Hadoop 2.4.1 :)

Below is the code; someone may find it useful.

Note: it includes the commented-out 1.x code for reading and writing a sequence file. I was wondering where it picks up the file system, but when I executed it directly on the cluster, it picked it up properly (probably from core-site.xml, as mentioned in the Configuration documentation).
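For reference, the default file system is typically declared in core-site.xml; a minimal fragment (with a hypothetical NameNode address) looks like this. In Hadoop 2.x the property is fs.defaultFS (the 1.x name fs.default.name is deprecated):

```
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://namenode-host:9000</value>
  </property>
</configuration>
```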

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SequenceFileOperator {

    private Configuration conf = new Configuration();
    /*private FileSystem fs;
    {
        try {
            fs = FileSystem.get(URI.create("hdfs://cldx-1336-1202:9000"), conf);
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }*/

    public static void main(String[] args) throws IOException {

        if (args == null || args.length < 2) {

            System.out
                    .println("Following are the possible invocations: <operation id> <arg1> <arg2> ...");

            System.out
                    .println("1 <absolute path of directory containing documents> <HDFS path of the sequence file>");

            System.out.println("2 <HDFS path of the sequence file>");
            return;
        }

        int operation = Integer.valueOf(args[0]);

        SequenceFileOperator docToSeqFileWriter = new SequenceFileOperator();

        switch (operation) {

        case 1: {
            if (args.length < 3) {
                System.out.println("Operation 1 requires both a document directory and a sequence file path");
                return;
            }
            String docDirectoryPath = args[1];
            String sequenceFilePath = args[2];

            System.out.println("Writing files present at " + docDirectoryPath
                    + " to the sequence file " + sequenceFilePath);

            docToSeqFileWriter.loadDocumentsToSequenceFile(docDirectoryPath,
                    sequenceFilePath);

            break;
        }

        case 2: {

            String sequenceFilePath = args[1];

            System.out.println("Reading the sequence file " + sequenceFilePath);

            docToSeqFileWriter.readSequenceFile(sequenceFilePath);

            break;
        }

        }

    }

    private void readSequenceFile(String sequenceFilePath) throws IOException {
        // Read (key, value) pairs using the Hadoop 2.x Reader.Option API

        /*
         * SequenceFile.Reader sequenceFileReader = new SequenceFile.Reader(fs,
         * new Path(sequenceFilePath), conf);
         */
        Option filePath = SequenceFile.Reader.file(new Path(sequenceFilePath));
        SequenceFile.Reader sequenceFileReader = new SequenceFile.Reader(conf,
                filePath);

        Writable key = (Writable) ReflectionUtils.newInstance(
                sequenceFileReader.getKeyClass(), conf);
        Writable value = (Writable) ReflectionUtils.newInstance(
                sequenceFileReader.getValueClass(), conf);

        try {

            while (sequenceFileReader.next(key, value)) {

                System.out
                        .printf("[%s] %s %s \n",
                                sequenceFileReader.getPosition(), key,
                                value.getClass());
            }
        } finally {
            IOUtils.closeStream(sequenceFileReader);
        }

    }

    private void loadDocumentsToSequenceFile(String docDirectoryPath,
            String sequenceFilePath) throws IOException {
        // Package each file in the directory as a (filename, contents) record

        File docDirectory = new File(docDirectoryPath);

        if (!docDirectory.isDirectory()) {
            System.out
                    .println("Please provide an absolute path of a directory that contains the documents to be added to the sequence file");
            return;
        }

        /*
         * SequenceFile.Writer sequenceFileWriter =
         * SequenceFile.createWriter(fs, conf, new Path(sequenceFilePath),
         * Text.class, BytesWritable.class);
         */
        org.apache.hadoop.io.SequenceFile.Writer.Option filePath = SequenceFile.Writer
                .file(new Path(sequenceFilePath));
        org.apache.hadoop.io.SequenceFile.Writer.Option keyClass = SequenceFile.Writer
                .keyClass(Text.class);
        org.apache.hadoop.io.SequenceFile.Writer.Option valueClass = SequenceFile.Writer
                .valueClass(BytesWritable.class);

        SequenceFile.Writer sequenceFileWriter = SequenceFile.createWriter(
                conf, filePath, keyClass, valueClass);

        File[] documents = docDirectory.listFiles();

        try {
            for (File document : documents) {

                RandomAccessFile raf = new RandomAccessFile(document, "r");
                byte[] content = new byte[(int) raf.length()];

                raf.readFully(content);

                sequenceFileWriter.append(new Text(document.getName()),
                        new BytesWritable(content));

                raf.close();
            }
        } finally {
            IOUtils.closeStream(sequenceFileWriter);
        }

    }
}
answered Oct 11 '22 by Kaliyug Antagonist