
Archive for August, 2012

/**
 * You may copy, adapt, and redistribute this program for non-commercial use.
 * There is no warranty for the program. The entire risk as to the quality and
 * performance of the program is with you. Should the program prove defective,
 * you assume the cost of all necessary servicing, repair or correction.
 */

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.zip.Inflater;
import java.util.zip.Deflater;
import java.util.zip.CRC32;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A class to split a BAM file.
 * @author Zhengqiu Cai
 * @version 1.00 6/2012
 */

//split a large BAM file at alignment gaps so you can do the analysis in parallel, such as calling variants
public class SplitBamByGap{
static byte [] head = null;//the uncompressed first BGZF block, which holds the bam head; filled in by readBamHead()
private static byte [] pipe_buffer = null;
private static int pipe_buffer_length = 0;
//current position in the uncompressed BAM data decoded from the original BAM file
private static long bam_position = 0;
private static long bam_length = 0;
private static int pipe_buffer_start = 0;
private static int split_by_chromosome = 0;
public static void main(String[] args) throws Exception {
if (args.length < 1) {
System.out.println("Usage: java "
+ SplitBamByGap.class.getName()
+ " <input file> [split size in bytes (65536000 by default) or -chr to split by chromosome]");
System.exit(1);
}

String inPutBamFile = args[0];
int hdfs_block_size = 65536000;//64M by default
if (args.length > 1){
Pattern patt = Pattern.compile("[0-9]+");
Matcher m = patt.matcher(args[1]);
if (m.matches()){
hdfs_block_size = Integer.parseInt(args[1]);
}else if (args[1].matches("-chr")){
split_by_chromosome = 1;
}
}

long mark = 0;//bgzf position of the last split
//current position in the original (compressed) BAM file
long bgzf_position = 0;
//start position of the BGZF block that contains the previous split point
long previous_bgzf_position = 0;
//size of the tail (to the end of the uncompressed BGZF block) that becomes the first data block of the new BAM file
int part_previous_size = 0;
//size of the head (from the start of the uncompressed BGZF block) that becomes the final block of the new BAM file
int part_next_size = 0;
long previous_bam_length = 0;

int bgzf_block_size = 0;
int previous_converted_refID = -1;
DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(inPutBamFile)));

int first_seq_pos = 0;
int last_seq_pos = 0;

//begin to read bam header
if (pipe_buffer_length == pipe_buffer_start) {
bgzf_block_size = readBGZFBlock(in,pipe_buffer_start,pipe_buffer_length);
bgzf_position += bgzf_block_size;
}

byte [] magic = Arrays.copyOfRange(pipe_buffer, pipe_buffer_start, pipe_buffer_start+4);
pipe_buffer_start += 4;
System.out.println(">magic:"+new String(magic));
bam_position += 4;
byte [] l_text = Arrays.copyOfRange(pipe_buffer, pipe_buffer_start, pipe_buffer_start+4);
pipe_buffer_start += 4;
int converted_l_text = bigToLittleEndian(byteArrayToInt(l_text,0));
bam_position += 4;
byte [] text = Arrays.copyOfRange(pipe_buffer, pipe_buffer_start, pipe_buffer_start+converted_l_text);
pipe_buffer_start += converted_l_text;
bam_position += converted_l_text;
byte [] n_ref = Arrays.copyOfRange(pipe_buffer, pipe_buffer_start, pipe_buffer_start+4);
pipe_buffer_start += 4;
int converted_n_ref = bigToLittleEndian(byteArrayToInt(n_ref,0));
bam_position += 4;

for (int i=0;i<converted_n_ref;i++){

byte [] l_name = Arrays.copyOfRange(pipe_buffer, pipe_buffer_start, pipe_buffer_start+4);
pipe_buffer_start += 4;
int converted_l_name = bigToLittleEndian(byteArrayToInt(l_name,0));
bam_position += 4;
byte [] name = Arrays.copyOfRange(pipe_buffer, pipe_buffer_start, pipe_buffer_start+converted_l_name);
pipe_buffer_start += converted_l_name;
bam_position += converted_l_name;
byte [] l_ref = Arrays.copyOfRange(pipe_buffer, pipe_buffer_start, pipe_buffer_start+4);
pipe_buffer_start += 4;
int converted_l_ref = bigToLittleEndian(byteArrayToInt(l_ref,0));
bam_position += 4;
}
part_previous_size = (int)(bam_length - bam_position);//do not include the bam head in the first split; the head is regenerated for every split in createBam()

if (pipe_buffer_length == pipe_buffer_start) {

bgzf_block_size = readBGZFBlock(in,pipe_buffer_start,pipe_buffer_length);
bgzf_position += bgzf_block_size;
}
int converted_block_size = 0;
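//walk the alignment records: each record begins with a 4-byte block_size,
//followed by refID and pos; a split is only taken when the current record
//does not straddle a BGZF block boundary (flag stays 0)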
while (pipe_buffer_length > pipe_buffer_start){

int flag = 0;
//the 4-byte block_size field spans into the next BGZF block
if (bam_position + 4 > bam_length){

previous_bam_length = bam_length;
bgzf_block_size = readBGZFBlock(in,pipe_buffer_start,pipe_buffer_length);
bgzf_position += bgzf_block_size;
flag = 1;
}

byte [] block_size = Arrays.copyOfRange(pipe_buffer, pipe_buffer_start, pipe_buffer_start+4);
pipe_buffer_start += 4;
converted_block_size = bigToLittleEndian(byteArrayToInt(block_size,0));
bam_position += 4;
bam_position += converted_block_size;

if (bam_position > bam_length){
previous_bam_length = bam_length;
bgzf_block_size = readBGZFBlock(in,pipe_buffer_start,pipe_buffer_length);
bgzf_position += bgzf_block_size;
flag = 1;
}
byte [] refID = Arrays.copyOfRange(pipe_buffer, pipe_buffer_start, pipe_buffer_start+4);
pipe_buffer_start += 4;
int converted_refID = bigToLittleEndian(byteArrayToInt(refID,0));

byte [] pos = Arrays.copyOfRange(pipe_buffer, pipe_buffer_start, pipe_buffer_start+4);
pipe_buffer_start += 4;
int converted_pos = bigToLittleEndian(byteArrayToInt(pos,0));
first_seq_pos = converted_pos;

byte [] t = Arrays.copyOfRange(pipe_buffer, pipe_buffer_start, (pipe_buffer_start+converted_block_size-8));
pipe_buffer_start += converted_block_size-8;

if (pipe_buffer_length == pipe_buffer_start) {
previous_bam_length = bam_length;
bgzf_block_size = readBGZFBlock(in,pipe_buffer_start,pipe_buffer_length);
bgzf_position += bgzf_block_size;
flag = 1;
}

if (split_by_chromosome == 1){//split by chromosome
if ((previous_converted_refID != converted_refID)&&(converted_refID != -1)&&(previous_converted_refID != -1)){//if a read is not mapped, converted_refID == -1
System.out.println(previous_converted_refID+" "+converted_refID);
if (flag == 0){//only split at a clean BGZF block boundary
part_next_size = (int)(bam_position - previous_bam_length) - converted_block_size - 4;
createBam(inPutBamFile,previous_bgzf_position,part_previous_size,(bgzf_position-bgzf_block_size),part_next_size);
previous_bgzf_position = bgzf_position - bgzf_block_size;
part_previous_size = (int)(bam_length - bam_position)+converted_block_size+4;
mark = bgzf_position;
}

}
}
else if ((first_seq_pos - last_seq_pos > 200)&&(previous_converted_refID == converted_refID)){//split by size
if(bgzf_position - mark >= hdfs_block_size){
if (flag == 0){//only split at a clean BGZF block boundary
part_next_size = (int)(bam_position - previous_bam_length) - converted_block_size - 4;
System.out.println(previous_bgzf_position+"\t"+part_previous_size+"\t"+(bgzf_position-bgzf_block_size)+"\t"+part_next_size);
createBam(inPutBamFile,previous_bgzf_position,part_previous_size,(bgzf_position-bgzf_block_size),part_next_size);
previous_bgzf_position = bgzf_position - bgzf_block_size;
part_previous_size = (int)(bam_length - bam_position)+converted_block_size+4;
mark = bgzf_position;
}
}
}
last_seq_pos = first_seq_pos;
previous_converted_refID = converted_refID;
}//end while
part_next_size = (int)(bam_position - previous_bam_length);
createBam(inPutBamFile,previous_bgzf_position,part_previous_size,(bgzf_position-bgzf_block_size),part_next_size);
}//end main()

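/*
 * Read one BGZF block. A BGZF block is a gzip member with a BC extra subfield
 * whose BSIZE value equals the total block length minus 1:
 * ID1 ID2 CM FLG MTIME XFL OS | XLEN | SI1 SI2 SLEN BSIZE | CDATA | CRC32 ISIZE.
 * The inflated payload is appended to pipe_buffer after any bytes not yet
 * consumed from the previous block.
 */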
private static int readBGZFBlock(DataInputStream in, int buffer_position, int buffer_length){
int bsize = 0;
int total_len = 0;
pipe_buffer_start = 0;
try{
byte ID1 = in.readByte();
byte ID2 = in.readByte();
byte CM = in.readByte();
byte FLG = in.readByte();
int MTIME = in.readInt();
byte XFL = in.readByte();
byte OS = in.readByte();

short XLEN = in.readShort();
short converted_XLEN = bigToLittleEndianShort(XLEN);
int xlen = (int) converted_XLEN & 0x0000ffff;

//the BC extra subfield (SI1, SI2, SLEN); this assumes it is the only subfield, i.e. xlen == 6
byte [] subfield = new byte[4];
in.readFully(subfield);

short BSIZE = in.readShort();
short converted_BSIZE = bigToLittleEndianShort(BSIZE);
bsize = (int) (converted_BSIZE & 0x0000ffff);

//read the compressed contents: the whole block is bsize+1 bytes, minus 12 header bytes, xlen extra bytes, and the 8-byte footer
byte [] CDATA = new byte[bsize-xlen-19];
in.readFully(CDATA);//readFully: a plain read() may return fewer bytes than requested

//read the remaining gzip footer
int CRC32 = in.readInt();
int ISIZE = in.readInt();
int converted_ISIZE = bigToLittleEndian(ISIZE);

//unzip the compressed contents using the inflate method
Inflater decompresser = new Inflater(true);//must use true (raw deflate): BGZF blocks do not include the zlib header
decompresser.setInput(CDATA);

byte[] content = new byte[converted_ISIZE];

if (buffer_position == buffer_length) {
pipe_buffer_length = decompresser.inflate(content);
bam_length += pipe_buffer_length;
pipe_buffer = content;
}else{
//carry over the unconsumed tail of the old buffer, then append the newly inflated block
int j = 0;
pipe_buffer_length = decompresser.inflate(content);
bam_length += pipe_buffer_length;
byte [] concatenate = new byte[pipe_buffer_length + buffer_length - buffer_position];
for (int i=buffer_position;i<buffer_length;i++){
concatenate[j] = pipe_buffer[i];
j++;
}

for (int i=0;i<pipe_buffer_length;i++){
concatenate[j] = content[i];
j++;
}
pipe_buffer = concatenate;
pipe_buffer_length = pipe_buffer_length + buffer_length - buffer_position;
}
decompresser.end();
}
catch(Exception e){
e.printStackTrace();
}
return bsize+1;//the BGZF block size equals bsize+1
}

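/*
 * Write one split file: a fresh BGZF block holding the recompressed bam head,
 * then a block holding the last first_pos uncompressed bytes of the BGZF block
 * at pre_bgzf_pos, then the intermediate BGZF blocks copied through verbatim,
 * then a block holding the first next_pos uncompressed bytes of the BGZF block
 * at bgzf_pos, and finally the standard 28-byte BGZF EOF marker.
 */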
private static void createBam(String input_bam_file, long pre_bgzf_pos, int first_pos, long bgzf_pos,int next_pos){
DataInputStream in = null;
FileOutputStream fos = null;
File outf = null;
try{
in = new DataInputStream(new BufferedInputStream(new FileInputStream(input_bam_file)));

outf = new File(new File(input_bam_file).getName()+"."+pre_bgzf_pos+"_"+first_pos+"_"+bgzf_pos+"_"+next_pos+".bam");//store tmp files
fos = new FileOutputStream(outf);

int bsize = 0;
int count = 0;
long current_bgzf_pos = pre_bgzf_pos;
in.skip(pre_bgzf_pos);
while(true){
if (count == 0){
//the first bgzf block, which contains only the bam head
//compress and write the bam head
int bam_head_len = readBamHead(input_bam_file);
byte [] compressed_bam_head = new byte[bam_head_len + 100];//a little extra room: deflate can slightly expand incompressible input
Deflater compresserHead = new Deflater(Deflater.DEFAULT_COMPRESSION,true);//BAM does not use the zlib wrapper, so this constructor must be used
compresserHead.setInput(head,0,bam_head_len);
compresserHead.finish();
int out_len2 = compresserHead.deflate(compressed_bam_head);
compresserHead.end();

byte [] bgzf_head = new byte[] {0x1f,(byte)0x8b,(byte)0x08,0x04};
fos.write(bgzf_head);

byte [] otherbgzf_head= new byte[] {0x00,0x00,0x00,0x00,0x00,(byte)0xff,0x06,0x00,0x42,0x43};
fos.write(otherbgzf_head);

byte [] SLEN_head = new byte[] {0x02,0x00};
fos.write(SLEN_head);

byte [] add_BZISE_head = intToByteArray(out_len2+19+6,2);//6 is XLEN
fos.write(add_BZISE_head);

fos.write(compressed_bam_head,0,out_len2);//write bam head content

CRC32 checksum_head = new CRC32();
checksum_head.update(head,0,bam_head_len);//compute the CRC32 of the original uncompressed data, not the compressed data
byte [] add_CRC32_head = intToByteArray((int)checksum_head.getValue());
fos.write(add_CRC32_head);

byte [] add_ISIZE_head = intToByteArray(bam_head_len);
fos.write(add_ISIZE_head);

//the second bgzf block, which contains the first partial bgzf block
byte [] other = new byte[10];
in.read(other);

byte [] XLEN = new byte[2];
in.read(XLEN);
short converted_XLEN = bigToLittleEndianShort(byteArrayToShort(XLEN,0));
int xlen = (int) converted_XLEN & 0x0000ffff;

byte [] sub = new byte[4];
in.read(sub);

byte [] BSIZE = new byte[2];
in.read(BSIZE);
short converted_BSIZE = bigToLittleEndianShort(byteArrayToShort(BSIZE,0));
bsize = (int) (converted_BSIZE & 0x0000ffff);
current_bgzf_pos = current_bgzf_pos + bsize + 1;

//process compressed contents
byte [] CDATA = new byte[bsize-xlen-19];
in.read(CDATA);

//process the remaining zip metadata
byte [] CRC32 = new byte[4];
in.read(CRC32);

byte [] ISIZE = new byte[4];
in.read(ISIZE);
int converted_ISIZE = bigToLittleEndian(byteArrayToInt(ISIZE,0));

//unzip compressed contents using inflate method
Inflater decompresser = new Inflater(true);//must use true (raw deflate): BGZF blocks do not include the zlib header
decompresser.setInput(CDATA);
byte[] content = new byte[converted_ISIZE];
decompresser.inflate(content);
decompresser.end();
byte [] first_part = Arrays.copyOfRange(content, converted_ISIZE-first_pos, converted_ISIZE);
Deflater compresser = new Deflater(Deflater.DEFAULT_COMPRESSION,true);//BAM does not use the zlib wrapper, so this constructor must be used

byte [] buffer = new byte[converted_ISIZE + 100];//a little extra room: deflate can slightly expand incompressible input
compresser.setInput(first_part);
compresser.finish();

int out_len = compresser.deflate(buffer);
compresser.end();//release the compressor; finish() was already called above

if (first_pos != 0){//if the first part is empty, skip this section; otherwise an empty BGZF block would be written, which marks the end of a BAM file
//write gzip header
byte [] bgzf = new byte[] {0x1f,(byte)0x8b,(byte)0x08,0x04};
fos.write(bgzf);

byte [] otherbgzf= new byte[] {0x00,0x00,0x00,0x00,0x00,(byte)0xff,0x06,0x00,0x42,0x43};
fos.write(otherbgzf);

byte [] SLEN = new byte[] {0x02,0x00};
fos.write(SLEN);

byte [] add_BZISE = intToByteArray(out_len+19+6,2);//6 is XLEN
fos.write(add_BZISE);

fos.write(buffer,0,out_len);//write bam content

CRC32 checksum = new CRC32();
checksum.update(first_part);//compute the CRC32 of the original uncompressed data, not the compressed data
byte [] add_CRC32 = intToByteArray((int)checksum.getValue());
fos.write(add_CRC32);

byte [] add_ISIZE = intToByteArray(first_part.length);
fos.write(add_ISIZE);
}

count = 1;

}
else if (count == 1){
if (current_bgzf_pos < bgzf_pos){
byte [] other = new byte[10];
in.read(other);
fos.write(other);

byte [] XLEN = new byte[2];
in.read(XLEN);
fos.write(XLEN);
short converted_XLEN = bigToLittleEndianShort(byteArrayToShort(XLEN,0));
int xlen = (int) converted_XLEN & 0x0000ffff;

byte [] sub = new byte[4];
in.read(sub);
fos.write(sub);

byte [] BSIZE = new byte[2];
in.read(BSIZE);
fos.write(BSIZE);
short converted_BSIZE = bigToLittleEndianShort(byteArrayToShort(BSIZE,0));
bsize = (int) (converted_BSIZE & 0x0000ffff);
current_bgzf_pos = current_bgzf_pos + bsize + 1;

//process compressed contents
byte [] CDATA = new byte[bsize-xlen-19];
in.read(CDATA);
fos.write(CDATA);

//process the remaining zip metadata
byte [] CRC32 = new byte[4];
in.read(CRC32);
fos.write(CRC32);
byte [] ISIZE = new byte[4];

in.read(ISIZE);
fos.write(ISIZE);
int converted_ISIZE = bigToLittleEndian(byteArrayToInt(ISIZE,0));
}

if(current_bgzf_pos == bgzf_pos){
count = 2;
}
}else{
//the final bgzf block, which contains the next partial bgzf block
byte [] other = new byte[10];
in.read(other);

byte [] XLEN = new byte[2];
in.read(XLEN);
short converted_XLEN = bigToLittleEndianShort(byteArrayToShort(XLEN,0));
int xlen = (int) converted_XLEN & 0x0000ffff;

byte [] sub = new byte[4];
in.read(sub);

byte [] BSIZE = new byte[2];
in.read(BSIZE);
short converted_BSIZE = bigToLittleEndianShort(byteArrayToShort(BSIZE,0));
bsize = (int) (converted_BSIZE & 0x0000ffff);
current_bgzf_pos = current_bgzf_pos + bsize + 1;

//process compressed contents
byte [] CDATA = new byte[bsize-xlen-19];
in.read(CDATA);

//process the remaining zip metadata
byte [] CRC32 = new byte[4];
in.read(CRC32);

byte [] ISIZE = new byte[4];
in.read(ISIZE);
int converted_ISIZE = bigToLittleEndian(byteArrayToInt(ISIZE,0));

//unzip the compressed contents using the inflate method
Inflater decompresser = new Inflater(true);//must use true (raw deflate): BGZF blocks do not include the zlib header
decompresser.setInput(CDATA);
byte[] content = new byte[converted_ISIZE];
decompresser.inflate(content);
decompresser.end();
byte [] next_part = Arrays.copyOfRange(content, 0, next_pos);
Deflater compresser = new Deflater(Deflater.DEFAULT_COMPRESSION,true);//BAM does not use the zlib wrapper, so this constructor must be used

byte [] buffer = new byte[converted_ISIZE + 100];//a little extra room: deflate can slightly expand incompressible input
compresser.setInput(next_part);
compresser.finish();
int out_len = compresser.deflate(buffer);
compresser.end();//release the compressor; finish() was already called above

if (next_pos != 0){//if the next part is empty, skip this section; otherwise an empty BGZF block would be written, which marks the end of a BAM file
//write gzip header
byte [] bgzf = new byte[] {0x1f,(byte)0x8b,(byte)0x08,0x04};
fos.write(bgzf);
byte [] otherbgzf= new byte[] {0x00,0x00,0x00,0x00,0x00,(byte)0xff,0x06,0x00,0x42,0x43};
fos.write(otherbgzf);
byte [] SLEN = new byte[] {0x02,0x00};
fos.write(SLEN);

byte [] add_BZISE = intToByteArray(out_len+19+6,2);//6 is XLEN
fos.write(add_BZISE);

fos.write(buffer,0,out_len);//write bam content

CRC32 checksum = new CRC32();
checksum.update(next_part);//compute the CRC32 of the original uncompressed data, not the compressed data
byte [] add_CRC32 = intToByteArray((int)checksum.getValue());
fos.write(add_CRC32);

byte [] add_ISIZE = intToByteArray(next_part.length);
fos.write(add_ISIZE);
}
byte [] EOF = new byte[] {(byte)0x1F,(byte)0x8B,(byte)8,(byte)4,0,0,0,0,0,(byte)0xFF,6,0,(byte)0x42,(byte)0x43,2,0,(byte)0x1B,0,3,0,0,0,0,0,0,0,0,0};//the standard 28-byte BGZF EOF marker
fos.write(EOF);
break;
}
}
}catch(Exception e){
e.printStackTrace();
}
finally{
if (in != null){
try{
in.close();
}catch(Exception ex){}
}
if (fos != null){
try{
fos.close();
}catch(Exception ex2){}
}
}
}

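/*
 * Inflate the first BGZF block of the input (which holds the bam head), keep
 * the uncompressed bytes in the static head array, and walk the header fields
 * to return the header length in bytes.
 */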
private static int readBamHead(String bamFile){
int bam_head_len = 0;
try{
DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(bamFile)));

//read the first bgzf block
byte [] other = new byte[10];
in.read(other);

byte [] XLEN = new byte[2];
in.read(XLEN);
short converted_XLEN = bigToLittleEndianShort(byteArrayToShort(XLEN,0));
int xlen = (int) converted_XLEN & 0x0000ffff;

byte [] sub = new byte[4];
in.read(sub);

byte [] BSIZE = new byte[2];
in.read(BSIZE);
short converted_BSIZE = bigToLittleEndianShort(byteArrayToShort(BSIZE,0));
int bsize = (int) (converted_BSIZE & 0x0000ffff);

//process compressed contents
byte [] CDATA = new byte[bsize-xlen-19];
in.readFully(CDATA);//readFully: a plain read() may return fewer bytes than requested

//process the remaining zip metadata
byte [] CRC32 = new byte[4];
in.read(CRC32);
byte [] ISIZE = new byte[4];

in.read(ISIZE);
int converted_ISIZE = bigToLittleEndian(byteArrayToInt(ISIZE,0));

//unzip the compressed contents using the inflate method
Inflater decompresser = new Inflater(true);//must use true (raw deflate): BGZF blocks do not include the zlib header
decompresser.setInput(CDATA);
byte[] content = new byte[converted_ISIZE];
int len = decompresser.inflate(content);
decompresser.end();
head = content;

//walk the header fields to compute the header length
byte [] magic = Arrays.copyOfRange(content, bam_head_len, bam_head_len+4);
bam_head_len += 4;

byte [] l_text = Arrays.copyOfRange(content, bam_head_len, bam_head_len+4);
bam_head_len += 4;
int converted_l_text = bigToLittleEndian(byteArrayToInt(l_text,0));

byte [] text = Arrays.copyOfRange(content, bam_head_len, bam_head_len+converted_l_text);
bam_head_len += converted_l_text;

byte [] n_ref = Arrays.copyOfRange(content, bam_head_len, bam_head_len+4);
bam_head_len += 4;
int converted_n_ref = bigToLittleEndian(byteArrayToInt(n_ref,0));

for (int i=0;i<converted_n_ref;i++){
byte [] l_name = Arrays.copyOfRange(content, bam_head_len, bam_head_len+4);
bam_head_len += 4;
int converted_l_name = bigToLittleEndian(byteArrayToInt(l_name,0));

byte [] name = Arrays.copyOfRange(content, bam_head_len, bam_head_len+converted_l_name);
bam_head_len += converted_l_name;

byte [] l_ref = Arrays.copyOfRange(content, bam_head_len, bam_head_len+4);
bam_head_len += 4;
}
in.close();
}catch(Exception e){
e.printStackTrace();
}
return bam_head_len;
}

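//BAM stores all integers little-endian, while DataInputStream reads big-endian,
//so every multi-byte field read above is byte-swapped with the helpers below.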
public static int bigToLittleEndian(int bigendian) {
ByteBuffer buf = ByteBuffer.allocate(4);

buf.order(ByteOrder.BIG_ENDIAN);
buf.putInt(bigendian);

buf.order(ByteOrder.LITTLE_ENDIAN);
return buf.getInt(0);
}

public static short bigToLittleEndianShort(short bigendian) {
ByteBuffer buf = ByteBuffer.allocate(2);

buf.order(ByteOrder.BIG_ENDIAN);
buf.putShort(bigendian);

buf.order(ByteOrder.LITTLE_ENDIAN);
return buf.getShort(0);
}

public static int byteArrayToInt(byte[] b, int offset) {
int value = 0;
for (int i = 0; i < 4; i++) {
int shift = (4 - 1 - i) * 8;
value += (b[i + offset] & 0x000000FF) << shift;
}
return value;
}

public static short byteArrayToShort(byte[] b,int offset) {
int value = 0;
for (int i = 0; i < 2; i++) {
int shift = (2 - 1 - i) * 8;
value += (b[i + offset] & 0x000000FF) << shift;
}
return (short)value;
}

public static byte [] intToByteArray(int number){//little endian
byte [] byteArray = new byte[4];
byteArray[3] = (byte)((number >> 24) & 0xFF);
byteArray[2] = (byte)((number >> 16) & 0xFF);
byteArray[1] = (byte)((number >> 8) & 0xFF);
byteArray[0] = (byte)(number & 0xFF);
return byteArray;
}

public static byte [] intToByteArray(int number, int len){//little endian, len bytes
byte [] byteArray = new byte[len];
for (int i = 0; i < len; i++) {
byteArray[i] = (byte)((number >> (8 * i)) & 0xFF);
}
return byteArray;
}
}


If the data are on a Hadoop datanode or namenode, we can use hadoop fs -put or hadoop fs -copyFromLocal to upload them to HDFS. But if the data are on a machine outside the Hadoop cluster, those commands cannot be used directly.

This Java program solves the problem by uploading data directly to HDFS from a remote machine, so it is not necessary to download the data to a Hadoop node first and then upload them to HDFS with hadoop fs -put or hadoop fs -copyFromLocal.

To run it, a recent version of Java is required (old versions of Java have a bug that caps the supported file size at 4GB; if the file to be transferred is larger than 4GB, an exception will occur).
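
At its core, the program simply opens an InputStream from the URL and streams the bytes into a file created on HDFS. A minimal sketch of that pattern is below (the source URL, HDFS path, and namenode URI are placeholders); the full program that follows adds an explicit block size, permissions, replication, and a progress bar.

import java.io.InputStream;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

class RemoteUploadSketch {
public static void main(String[] args) throws Exception {
//placeholders: replace with your source URL, target HDFS path, and namenode URI
InputStream is = new URL("ftp://example.org/data.bam").openStream();
Configuration conf = new Configuration();
conf.set("fs.default.name", "hdfs://namenode:50001");
FileSystem fs = FileSystem.get(conf);
FSDataOutputStream out = fs.create(new Path("/user/root/input/data.bam"));
IOUtils.copyBytes(is, out, 65536, true);//copy everything, then close both streams
}
}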

import java.net.URL;
import java.net.URI;
import java.net.URLConnection;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.net.MalformedURLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.fs.permission.FsPermission;

class RemoteUploadData {
public static void main(String[] args) throws Exception {
if (args.length < 3) {
System.err.println("Usage: java RemoteUploadData <URL_data_source> <full_path_file_name_on_HDFS> <hdfs_name_URI> [replication factor]");
System.err.println("For example: java RemoteUploadData ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/pilot2_high_cov_GRCh37_bams/data/NA19240/alignment/NA19240.chrom1.SOLID.bfast.YRI.high_coverage.20100311.bam /user/root/input/test_input.bam hdfs://ip-10-224-53-114.ec2.internal:50001");
System.err.println("This will upload the file from the ftp site to your hadoop file system.");
System.exit(2);
}

try {
//String data_file = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/pilot2_high_cov_GRCh37_bams/data/NA19240/alignment/NA19240.chrom1.SOLID.bfast.YRI.high_coverage.20100311.bam";
String data_file = args[0];
URL url = new URL(data_file);
URLConnection con = url.openConnection();
long fileSize = Long.parseLong(con.getHeaderField("content-length"));

/*
Map fields = con.getHeaderFields();
Set set = fields.entrySet();
Iterator iterator = set.iterator();
while(iterator.hasNext()) {
Map.Entry me = (Map.Entry)iterator.next();
System.out.print(me.getKey() + ": ");
System.out.println(me.getValue());
}
*/
//InputStream is = con.getInputStream();
InputStream is = url.openStream();
//BufferedInputStream bis = new BufferedInputStream(is);

Configuration conf = new Configuration();
//file to be created
//Path file = new Path("/user/root/input/test_input.bam");
Path file = new Path(args[1]);
//initiate hdfs
DistributedFileSystem dfs = new DistributedFileSystem();
//dfs.initialize(new URI("hdfs://ip-10-224-53-114.ec2.internal:50001"), conf); //fs.default.name
dfs.initialize(new URI(args[2]), conf);
FsPermission permissions = new FsPermission("750");
FSDataOutputStream out = null;
int bufferSize = 65536000;
long blockSize = 65536000;//64M
int totalBlocks = (int)(fileSize/blockSize);
//System.out.println(totalBlocks);
boolean overWrite = true;
try{
out = dfs.create(file,permissions,overWrite, bufferSize, (short)3, blockSize, null);
}catch(Exception e){
e.printStackTrace();
}

byte[] buf = new byte[bufferSize];
int n = is.read(buf);
/*
while (n >= 0){
out.write(buf, 0, n);
System.out.print(n+".");
n = is.read(buf);
}
*/
//a network InputStream may return short reads, so keep reading until buf is completely filled
int end = 0;
double blockRead = 0;//double so that the division below yields a fraction without explicit casting
while (true){
while (n != buf.length){
int ret = is.read(buf, n, buf.length - n);
if (ret == -1) {
end = 1;
break;
}
n += ret;
}
out.write(buf, 0, n);
blockRead++;
if (fileSize > 0){
updateProgress((blockRead/totalBlocks));
}
//System.out.print(".");
n = 0;
if (end == 1){
break;
}
}

out.close();
is.close();
//bis.close();

} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}

static void updateProgress(double progressPercentage) {
final int width = 100; // progress bar width in chars
System.out.print("\r[");
int i = 0;
for (; i <= (int)(progressPercentage*width); i++) {
System.out.print(">");
}
for (; i < width; i++) {
System.out.print("|");
}
System.out.print("]");
}
}

How to compile it?
Specify the Java classpath to include the following four jars:
commons-logging-1.1.1.jar
hadoop-core-0.20.203.0.jar
commons-configuration-1.6.jar
commons-lang-2.4.jar

jdk1.7.0_04/bin/javac -classpath .:/usr/local/hadoop-0.20.203.0/lib/commons-logging-1.1.1.jar:/usr/local/hadoop-0.20.203.0/hadoop-core-0.20.203.0.jar:/usr/local/hadoop-0.20.203.0/lib/commons-configuration-1.6.jar:/usr/local/hadoop-0.20.203.0/lib/commons-lang-2.4.jar RemoteUploadData.java

How to run it?
jdk1.7.0_04/bin/java -classpath .:/usr/local/hadoop-0.20.203.0/lib/commons-logging-1.1.1.jar:/usr/local/hadoop-0.20.203.0/hadoop-core-0.20.203.0.jar:/usr/local/hadoop-0.20.203.0/lib/commons-configuration-1.6.jar:/usr/local/hadoop-0.20.203.0/lib/commons-lang-2.4.jar RemoteUploadData http://ec2-184-73-123-11.compute-1.amazonaws.com/cnv_alignment_files/control_coverage_5.sam /user/root/input/test_input.bam3 hdfs://ip-10-224-53-114.ec2.internal:50001


PuTTY shortcuts

Move the cursor to the beginning of the line (Home, Ctrl+A, or Fn+Left Arrow)

Move the cursor to the end of the line (End, Ctrl+E, or Fn+Right Arrow)
