1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
//! Marks up your reads with useful information like read length
//!
//! # Examples
//!
//! ## Quick validation with stderr message
//! ```bash
//! cat file.fastq | fasten_inspect > markedup.fastq
//! cat file.fastq | fasten_inspect --paired-end > markedup-paired.fastq
//! ```
//!
//! The resulting marked-up fastq file will have deflines like
//!
//! ```text
//! @read0/1 id-at:1 seq-length:100 seq-invalid-chars: id-plus:1 qual-invalid-chars: avg-qual:20.93 qual-length:100 read-pair:1
//! ```
//!
//! # Usage
//!
//! ```text
//!fasten_inspect: Marks up your reads with useful information like read length
//!
//!Usage: fasten_inspect [-h] [-n INT] [-p] [--verbose] [--version]
//!
//!Options:
//! -h, --help Print this help menu.
//! -n, --numcpus INT Number of CPUs (default: 1)
//! -p, --paired-end The input reads are interleaved paired-end
//! --verbose Print more status messages
//! --version Print the version of Fasten and exit
//!
//! ```
//!
//! The fields will be found on the defline of the sequence and include:
//!
//!| key | type | example | note |
//!| --- | ----- | ------- | ------ |
//!| id-at | boolean (1 or 0) | id-at:1 | Whether or not the `@` was first character, first line |
//!| seq-invalid-chars | string | seq-invalid-chars:$$% | |
//!| qual-invalid-chars | string | qual-invalid-chars:[< | |
//!| seq-length | int | seq-length:100 | |
//!| id-plus | boolean | id-plus:1 | Whether or not the `+` was first character, 3rd line |
//!| avg-qual | float | avg-qual:17.52 | |
//!| qual-length | int | qual-length:100 | Length of the quality score line |
//!
//!
// TODO add points that were validated into the sequence deflines: length, is-paired, seq-regex=1, and anything else
extern crate getopts;
extern crate fasten;
extern crate regex;
use std::fs::File;
//use std::io::BufReader;
use std::io::{BufRead,BufReader};
use regex::Regex;
use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
fn main(){
let opts = fasten_base_options();
// Options specific to this script
// opts.optflag("","paired-end","The reads are interleaved paired-end");
let matches = fasten_base_options_matches("Marks up your reads with useful information like read length", opts);
let lines_per_read :u8 ={
if matches.opt_present("paired-end") {
8
}else{
4
}
};
// If there is a match on these, then mark invalid.
// In other words, we are looking for a pattern that
// is NOT the target seq or qual
let seq_regex = Regex::new(r"[^a-zA-Z]").expect("malformed seq regex");
//let qual_regex= Regex::new(r"[^!-Za-z]").expect("malformed qual regex");
let qual_regex= Regex::new(r"\s").expect("malformed qual regex");
validate_reads(lines_per_read, seq_regex, qual_regex);
if matches.opt_present("verbose") {
fasten::logmsg("These reads have been validated!");
}
}
/// marks up reads from stdin
fn validate_reads(lines_per_read: u8, seq_regex: regex::Regex, qual_regex: regex::Regex) {
let my_file = File::open("/dev/stdin").expect("Could not open file");
let mut my_buffer = BufReader::new(my_file);
let mut id = String::new();
let mut seq = String::new();
let mut plus = String::new();
let mut qual = String::new();
let mut i :u64 = 0;
loop{
id.clear();
if my_buffer.read_line(&mut id).expect("Cannot read new line") == 0 {
break;
}
seq.clear();
my_buffer.read_line(&mut seq).expect("ERROR: failed to read 'seq' line");
plus.clear();
my_buffer.read_line(&mut plus).expect("ERROR: failed to read 'plus' line");
qual.clear();
my_buffer.read_line(&mut qual).expect("ERROR: failed to read 'qual' line");
id = id.trim().to_string();
seq = seq.trim().to_string();
plus = plus.trim().to_string();
qual = qual.trim().to_string();
// Test ID
if id.chars().nth(0).unwrap() == '@' {
id = format!("{} id-at:1", &id);
} else {
id = format!("{} id-at:0", &id);
}
// Test Seq
id = format!("{} seq-length:{}", &id, seq.len());
let mut illegal_seq_chars:String = String::new();
if seq_regex.is_match(&seq) {
for cap in seq_regex.captures_iter(&seq) {
illegal_seq_chars.push_str(&cap[0]);
}
}
id = format!("{} seq-invalid-chars:{}", &id, &illegal_seq_chars);
// Test plus
if plus.chars().nth(0).unwrap() == '+' {
id = format!("{} id-plus:1", &id);
} else {
id = format!("{} id-plus:0", &id);
}
// Test qual
let mut illegal_qual_chars:String = String::new();
if qual_regex.is_match(&qual) {
for cap in qual_regex.captures_iter(&qual) {
illegal_qual_chars.push_str(&cap[0]);
}
}
id = format!("{} qual-invalid-chars:{}", &id, &illegal_qual_chars);
// quality score regex
let mut qual_total :usize = 0;
for q in qual.chars() {
qual_total += q as usize;
}
let avg_qual :f32 = {
if qual.len() == 0 {
-1.0
} else {
qual_total as f32 / qual.len() as f32 - 33.0
}
};
id = format!("{} avg-qual:{:.2}", &id, avg_qual);
id = format!("{} qual-length:{}", &id, qual.len());
let mut read_pair:u8 = ((i as u64 % lines_per_read as u64) + 1) as u8;
if read_pair > 1 {
read_pair = 2;
}
id = format!("{} read-pair:{}", &id, &read_pair);
i += 4;
println!("{}\n{}\n{}\n{}", id, seq, plus, qual);
}
}