1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
//! Perform random operations on fastq files, using unix streaming.
//! Secure your analysis with Fasten!
//! # Synopsis
//! ## read metrics
//! ```text
//!
//! $ cat testdata/R1.fastq testdata/R2.fastq | \
//! fasten_shuffle | fasten_metrics | column -t
//! totalLength numReads avgReadLength avgQual
//! 800 8 100 19.53875
//! ```
//!
//! ## read cleaning
//!
//! ```text
//! $ cat testdata/R1.fastq testdata/R2.fastq | \
//! fasten_shuffle | \
//! fasten_clean --paired-end --min-length 2 | \
//! gzip -c > cleaned.shuffled.fastq.gz
//!
//! $ zcat cleaned.shuffled.fastq.gz | fasten_metrics | column -t
//! totalLength numReads avgReadLength avgQual
//! 800 8 100 19.53875
//! ```
//! _NOTE_: No reads were actually filtered with cleaning, with --min-length=2
//!
//! ## Kmer counting
//! ```text
//! $ cat testdata/R1.fastq | \
//! fasten_kmer -k 21 > 21mers.tsv
//! ```
//!
//! ## Read sampling
//! ```text
//! $ cat testdata/R1.fastq testdata/R2.fastq | \
//! fasten_shuffle | \
//! fasten_sample --paired-end --frequency 0.1 > 10percent.fastq
//! ```
//!
//! # Advanced
//! ## Set of downsampled reads
//! Create a set of downsampled reads for a titration experiment
//! and clean them
//! ```text
//! for frequency in 0.1 0.2 0.3 0.4 0.5; do
//! cat testdata/R1.fastq testdata/R2.fastq | \
//! fasten_shuffle | \
//! fasten_clean --paired-end --min-length 50 --min-trim-quality 25
//! fasten_sample --paired-end --frequency $frequency > cleaned.$frequency.fastq
//! done
//! ```
//!
//! ## Validate a whole directory of fastq reads
//! ```text
//! \ls *_1.fastq.gz | xargs -n 1 -P 4 bash -c '
//! echo -n "." >&2 # progress bar
//! R1=$0
//! R2=${0/_1.fastq.gz/_2.fastq.gz}
//! zcat $R1 $R2 | fasten_shuffle | fasten_validate --paired-end
//! '
//! ```
extern crate regex;
extern crate statistical;
extern crate getopts;
use std::env;
use std::path::Path;
use std::collections::HashMap;
use getopts::Options;
use getopts::Matches;
/// input/output methods
pub mod io;
const VERSION: &'static str = env!("CARGO_PKG_VERSION");
/// Have some strings that can be printed which could be
/// used to propagate errors between piped scripts.
/// Invalid fastq ID (no @)
static INVALID_ID :&'static str= "invalid_id";
/// Invalid sequence (underscore)
static INVALID_SEQ :&'static str= "invalid_seq";
/// Invalid plus line (no +)
static INVALID_PLUS:&'static str= "invalid_plus";
/// Invalid qual line (~ is chr 126 when the normal max number is 40)
static INVALID_QUAL:&'static str= "invalid_qual";
/// Propagate an error by printing invalid read(s)
pub fn eexit() -> () {
println!("{}\n{}\n{}\n{}",INVALID_ID,INVALID_SEQ,INVALID_PLUS,INVALID_QUAL);
std::process::exit(1);
}
/// Rewrite print!() so that it doesn't panic on broken
/// pipe.
#[macro_export]
macro_rules! print (
// The extra scope is necessary so we don't leak imports
($($arg:tt)*) => ({
// The `write!()` macro is written so it can use `std::io::Write`
// or `std::fmt::Write`, this import sets which to use
use std::io::{self, Write};
if let Err(_) = write!(io::stdout(), $($arg)*) {
// Optionally write the error to stderr
::std::process::exit(0);
}
})
);
/// a function that reads an options object and adds fasten default options.
pub fn fasten_base_options() -> Options{
let mut opts = Options::new();
opts.optflag("h", "help", "Print this help menu.");
opts.optopt("n","numcpus","Number of CPUs (default: 1)","INT");
opts.optflag("p","paired-end","The input reads are interleaved paired-end");
opts.optflag("","verbose","Print more status messages");
opts.optflag("","version","Print the version of Fasten and exit");
return opts;
}
/// a function that processes the options on the command line
/// The brief is a str that describes the program without using the program
/// name, e.g., "counts kmers" for fasten_kmer.
/// This function also takes care of --version.
/// If --help is invoked, then the program name, the brief, and the usage()
/// are all printed to stdout and then the program exits with 0.
// TODO if possible add in default somehow for numcpus
pub fn fasten_base_options_matches(brief:&str, opts:Options) -> Matches{
let args: Vec<String> = env::args().collect();
let matches = opts.parse(&args[1..]).expect("ERROR: could not parse parameters");
if matches.opt_present("version") {
println!("Fasten v{}", &VERSION);
std::process::exit(0);
}
if matches.opt_present("h") {
let prog_name = Path::new(&args[0])
.file_stem().unwrap()
.to_str().unwrap();
println!("{}: {}\n\n{}",
&prog_name,
&brief,
&opts.usage(
&opts.short_usage(&prog_name)
),
);
std::process::exit(0);
}
return matches;
}
/// Print a formatted message to stderr
pub fn logmsg<S: AsRef<str>>(stringlike: S) {
let args: Vec<_> = env::args().collect();
// is there a better way to get the basename of the program???
let program = Path::file_name(Path::new(&args[0])).unwrap().to_str().unwrap();
let str_ref = stringlike.as_ref();
eprintln!("{}: {}", &program, str_ref);
}
/// Reverse complement a DNA sequence.
/// Take into account lowercase vs uppercase.
/// Ambiguity codes are also handled.
pub fn reverse_complement(dna: &str) -> String {
// Create a mapping for complement bases, including ambiguity codes.
let complement_map: HashMap<char, char> = [
('A', 'T'), ('T', 'A'), ('G', 'C'), ('C', 'G'),
('R', 'Y'), ('Y', 'R'), ('S', 'S'), ('W', 'W'),
('K', 'M'), ('M', 'K'), ('B', 'V'), ('V', 'B'),
('D', 'H'), ('H', 'D'), ('N', 'N'),
('a', 't'), ('t', 'a'), ('g', 'c'), ('c', 'g'),
('r', 'y'), ('y', 'r'), ('s', 's'), ('w', 'w'),
('k', 'm'), ('m', 'k'), ('b', 'v'), ('v', 'b'),
('d', 'h'), ('h', 'd'), ('n', 'n'),
]
.iter()
.cloned()
.collect();
// Generate the reverse complement.
dna.chars()
.rev()
.map(|base| complement_map.get(&base).cloned().unwrap_or('N')) // Default to 'N' for unknown bases.
.collect()
}