1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302
//! Collapse identical reads into single reads, recalculating quality values.
//! If paired end, then each set of reads must be identical to be collapsed.
//! _Warning_: due to multiple reads collapsing into one, read identifiers will be reconstituted.
//! # Examples
//! ```bash
//! cat testdata/four_reads | fasten_combine > combined.fastq
//! ```
//!## Usage
//!
//!```text
//!Usage: fasten_combine [-h] [-n INT] [-p] [-v] [--max-qual-char CHAR] [--min-qual-char CHAR]
//!
//!Options:
//! -h, --help Print this help menu.
//! -n, --numcpus INT Number of CPUs (default: 1)
//! -p, --paired-end The input reads are interleaved paired-end
//! -v, --verbose Print more status messages
//! --max-qual-char CHAR
//! Maximum quality character (default: I)
//! --min-qual-char CHAR
//! Minimum quality character (default: !)
//!
//! NOTE: range of quality scores is !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
//!```
extern crate fasten;
extern crate fastq;
extern crate getopts;
extern crate rand;
use std::io::stdin;
//use std::fs::File;
//use std::io::BufReader;
//use std::io::BufRead;
use std::collections::HashMap;
use std::f32;
use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
use fastq::{Parser, Record};
use fasten::logmsg;
/// need this constant because the compiler had a problem
/// with the syntax `10.0.pow()`
const TEN: f32 = 10.0;
/// Glues together paired end reads internally and is a
/// character not expected in any read
const READ_SEPARATOR :char = '~';
fn main(){
let mut opts = fasten_base_options();
// make a string of characters like !"#...GHI to represent all quals
let default_phred_min_char:char = '!';
let default_phred_max_char:char = 'I';
let default_phred_min:u8 = default_phred_min_char as u8 - 33;
let default_phred_max:u8 = default_phred_max_char as u8 - 33;
let mut qual_range_string = String::with_capacity((default_phred_max - default_phred_min + 1) as usize);
for phred in default_phred_min..default_phred_max+1 {
qual_range_string.push((phred+33) as char);
}
opts.optopt("","max-qual-char",
format!("Maximum quality character (default: {})", default_phred_max_char).as_str(),
"CHAR");
opts.optopt("","min-qual-char",
format!("Minimum quality character (default: {})", default_phred_min_char).as_str(),
"CHAR");
let description:String = format!("Collapse identical reads into single reads, recalculating quality values. If paired end, then each set of reads must be identical to be collapsed. Warning: due to multiple reads collapsing into one, read identifiers will be reconstituted. NOTE: range of quality scores is {}", qual_range_string);
let matches = fasten_base_options_matches(&description, opts);
let max_qual_char:char = matches.opt_default("max-qual-char", &default_phred_max_char.to_string())
.unwrap_or(String::from(default_phred_max_char))
.parse()
.expect("ERROR converting --max-qual-int value to integer");
let mut min_qual_char:char =
matches.opt_default("min-qual-char", &default_phred_min_char.to_string())
.unwrap_or(String::from(default_phred_min_char))
.parse()
.expect("ERROR converting --min-qual-int value to integer");
if min_qual_char < default_phred_min_char {
logmsg("--min-qual-char was less than the default minimum and so it will be set to the default");
min_qual_char = default_phred_min_char;
}
// Finally turn the choice of qual into an integer
let min_qual:u8 = min_qual_char as u8;
let max_qual:u8 = max_qual_char as u8;
let paired_end = matches.opt_present("paired-end");
let _num_cpus:usize = {
if matches.opt_present("numcpus") {
logmsg("Warning: This script does not make use of --numcpus");
1 as usize
} else {
1 as usize
}
};
// seq => count
let mut seq_count :HashMap<String, u32> =HashMap::new();
// seq => vec![sequence of prob of errors]
let mut seq_error_rate :HashMap<String, Vec<f32>> = HashMap::new();
let parser = Parser::new(stdin());
let mut parser_getter = parser.ref_iter();
parser_getter.advance().expect("Could not read the first fastq entry");
while let Some(record1) = parser_getter.get() {
let mut id:Vec<u8> = record1.head().to_vec();
let mut seq:Vec<u8> = record1.seq().to_vec();
let mut qual:Vec<u8> = record1.qual().to_vec();
if paired_end {
// get the next entry with advance() and then get()
match parser_getter.advance() {
Ok(_) => {},
Err(err) => {
panic!("ERROR: could not read the second entry in a paired end read: {}", err);
}
};
let record2 = &parser_getter.get().expect("ERROR parsing second pair in a paired end read");
let id2:&[u8] = record2.head();
let seq2:&[u8] = record2.seq();
let qual2:&[u8]= record2.qual();
// Add on the separator
id.push(READ_SEPARATOR as u8);
seq.push(READ_SEPARATOR as u8);
qual.push(READ_SEPARATOR as u8);
// Add on the second read
id.extend_from_slice(id2);
seq.extend_from_slice(seq2);
qual.extend_from_slice(qual2);
}
//println!("{:?}", qual);
// Keep track of the counts of identical sequence
let seq_string:String = String::from(
std::str::from_utf8(&seq[..])
.expect("ERROR converting slice to str")
);
/*
let id_string:String = String::from(
std::str::from_utf8(&id[..])
.expect("ERROR converting slice to str")
);
*/
let count = seq_count.entry(seq_string.clone()).or_insert(0);
*count += 1 as u32;
// If this sequence hasn't been seen yet,
// then instantiate the probabilities.
if !seq_error_rate.contains_key(&seq_string) {
let mut qual_vec:Vec<f32> = vec![];
for q in qual {
// Don't mess with the read separator character
if q == READ_SEPARATOR as u8 {
qual_vec.push(q as u8 as f32);
continue;
}
let qual_int = q as u8 as f32 - 33.0;
let p :f32 = TEN.powf((-1.0 * qual_int)/TEN);
qual_vec.push(p);
}
seq_error_rate.insert(seq_string.clone(), qual_vec);
}
//println!("{:?}", seq_error_rate.entry(seq_string));
// If this sequence has been seen yet, then
// start combining the error rates.
else {
// get the base error rate vector
let qual_vec = seq_error_rate.entry(seq_string.clone()).or_insert(Vec::new());
let these_errors = qual.into_iter().map(|qual_char|{
// Don't mess with the read separator character
if qual_char == READ_SEPARATOR as u8 {
return qual_char as u8 as f32;
}
let qual_int = qual_char as u8 as f32 - 33.0;
let p :f32 = TEN.powf((-1.0 * qual_int)/TEN);
return p;
}).collect();
let new_qual = combine_error_vectors(&qual_vec,&these_errors);
*qual_vec = new_qual;
}
match &parser_getter.advance() {
Ok(_) => {},
Err(_) => {break;}
};
}
let mut seq_counter=0;
for (seq,combined_qual) in seq_error_rate {
seq_counter += 1;
//println!("{:?}", seq);continue;
// TODO take care of PE reads
// Make a new cigar line for quality
let mut qual_cigar = String::new();
for p in combined_qual {
let mut qual_recalc :f32 = -TEN * (p).log(TEN)+33.0;
// check for overflow before switching to u8
if qual_recalc.is_infinite() || qual_recalc > max_qual as f32 {
qual_recalc = max_qual as f32;
}
// switch to u8 and then the corresponding char
let mut qual_recalc_char = qual_recalc.round() as u8 as char;
if (qual_recalc_char as u8) > max_qual {
qual_recalc_char = max_qual_char;
}
// a reduction in quality is not expected... but just in case.
if (qual_recalc_char as u8) < min_qual {
qual_recalc_char = min_qual_char;
}
qual_cigar.push(qual_recalc_char);
}
if paired_end {
// split the seq and qual for paired end
let separator_pos = seq.find(READ_SEPARATOR).expect("ERROR finding read separator");
let r1_seq = seq[0..separator_pos].to_string();
let r2_seq = seq[separator_pos+1..].to_string();
let r1_qual= qual_cigar[0..separator_pos].to_string();
let r2_qual= qual_cigar[separator_pos+1..].to_string();
println!("@{}/1\n{}\n+\n{}",seq_counter,r1_seq,r1_qual);
println!("@{}/2\n{}\n+\n{}",seq_counter,r2_seq,r2_qual);
} else {
println!("@{}\n{}\n+\n{}",seq_counter,seq,qual_cigar);
}
}
}
/// Combines vectors of error probabilities
/// such that the rate of error is probability of error
/// from vector one times the probability of error
/// from vector two.
fn combine_error_vectors(errors1 :&Vec<f32>, errors2: &Vec<f32>) -> Vec<f32> {
if errors1.len() != errors2.len() {
panic!("Lengths of error vectors do not match: {} and {}", errors1.len(), errors2.len());
}
let mut errors_iter2=errors2.iter();
let mut new_errors :Vec<f32> = Vec::new(); // TODO set length/capacity to errors.len()
for p1 in errors1 {
let p2 = errors_iter2.next().expect("ERROR: could not get the error probability from the second read");
new_errors.push(p1 * p2);
}
return new_errors;
}
// TODO a function that returns the 'min-seq' which is the
// sequence that comes first alphabetically when compared
// with its revcom
/*
#[allow(dead_code)]
fn recalculate_qual(qual_str: &str, count: u32) -> String {
let mut qual_out = String::new();
let max_qual = 'I' as u8;
let min_qual = '!' as u8;
let qual = qual_str.to_string();
for qual_char in qual.chars() {
let qual_int = qual_char as u8 as f32 - 33.0;
//let ten:f32=10.0;
let p :f32 = TEN.powf((-1.0 * qual_int)/TEN);
let p_recalc :f32 = p.powi(count as i32);
let mut qual_recalc :f32 = -TEN * (p_recalc).log(TEN)+33.0;
// check for overflow before switching to u8
if qual_recalc.is_infinite() || qual_recalc > 255.0 {
qual_recalc = 255.0;
}
// switch to u8 and then the corresponding char
let mut qual_recalc_char = qual_recalc.round() as u8 as char;
if (qual_recalc_char as u8) > max_qual {
qual_recalc_char = 'I';
}
// a reduction in quality is not expected... but just in case.
if (qual_recalc_char as u8) < min_qual {
qual_recalc_char = '!';
}
qual_out.push(qual_recalc_char);
}
return qual_out;
}
*/