1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
//! Perform random operations on fastq files, using unix streaming.
//! Secure your analysis with Fasten!
//! # Synopsis
//! ## read metrics
//! ```text
//! 
//! $ cat testdata/R1.fastq testdata/R2.fastq | \
//!     fasten_shuffle | fasten_metrics | column -t
//! totalLength  numReads  avgReadLength  avgQual
//! 800          8         100            19.53875
//! ```
//! 
//! ## read cleaning
//!
//! ```text
//! $ cat testdata/R1.fastq testdata/R2.fastq | \
//!     fasten_shuffle | \
//!     fasten_clean --paired-end --min-length 2 | \
//!     gzip -c > cleaned.shuffled.fastq.gz
//! 
//! $ zcat cleaned.shuffled.fastq.gz | fasten_metrics | column -t
//! totalLength  numReads  avgReadLength  avgQual
//! 800          8         100            19.53875
//! ```
//! _NOTE_: No reads were actually filtered with cleaning, with --min-length=2
//!
//! ## Kmer counting
//! ```text
//! $ cat testdata/R1.fastq | \
//!   fasten_kmer -k 21 > 21mers.tsv
//! ```
//! 
//! ## Read sampling
//! ```text
//! $ cat testdata/R1.fastq testdata/R2.fastq | \
//!     fasten_shuffle | \
//!     fasten_sample --paired-end --frequency 0.1 > 10percent.fastq
//! ```
//!
//! # Advanced
//! ## Set of downsampled reads
//! Create a set of downsampled reads for a titration experiment
//! and clean them
//! ```text
//! for frequency in 0.1 0.2 0.3 0.4 0.5; do
//!   cat testdata/R1.fastq testdata/R2.fastq | \
//!     fasten_shuffle | \
//!     fasten_clean --paired-end --min-length 50 --min-trim-quality 25
//!     fasten_sample --paired-end --frequency $frequency > cleaned.$frequency.fastq
//! done
//! ```
//!
//! ## Validate a whole directory of fastq reads
//! ```text
//! \ls *_1.fastq.gz | xargs -n 1 -P 4 bash -c '
//!   echo -n "." >&2 # progress bar
//!   R1=$0
//!   R2=${0/_1.fastq.gz/_2.fastq.gz}
//!   zcat $R1 $R2 | fasten_shuffle | fasten_validate --paired-end
//! '
//! ```

extern crate regex;
extern crate statistical;
extern crate getopts;
use std::env;
use std::path::Path;
use std::collections::HashMap;

use getopts::Options;
use getopts::Matches;

/// input/output methods
pub mod io;

const VERSION: &'static str = env!("CARGO_PKG_VERSION");

/// Have some strings that can be printed which could be
/// used to propagate errors between piped scripts.

/// Invalid fastq ID (no @)
static INVALID_ID  :&'static str= "invalid_id";
/// Invalid sequence (underscore)
static INVALID_SEQ :&'static str= "invalid_seq";
/// Invalid plus line (no +)
static INVALID_PLUS:&'static str= "invalid_plus";
/// Invalid qual line (~ is chr 126 when the normal max number is 40)
static INVALID_QUAL:&'static str= "invalid_qual";

/// Propagate an error by printing invalid read(s)
pub fn eexit() -> () {
    println!("{}\n{}\n{}\n{}",INVALID_ID,INVALID_SEQ,INVALID_PLUS,INVALID_QUAL);
    std::process::exit(1);
}

/// Rewrite print!() so that it doesn't panic on broken
/// pipe.
#[macro_export]
macro_rules! print (
    // The extra scope is necessary so we don't leak imports
    ($($arg:tt)*) => ({
        // The `write!()` macro is written so it can use `std::io::Write`
        // or `std::fmt::Write`, this import sets which to use
        use std::io::{self, Write};
        if let Err(_) = write!(io::stdout(), $($arg)*) {
            // Optionally write the error to stderr
            ::std::process::exit(0);
        }
        
    })
);

/// a function that reads an options object and adds fasten default options.
pub fn fasten_base_options() -> Options{
    let mut opts = Options::new();
    opts.optflag("h", "help", "Print this help menu.");
    opts.optopt("n","numcpus","Number of CPUs (default: 1)","INT");
    opts.optflag("p","paired-end","The input reads are interleaved paired-end");
    opts.optflag("","verbose","Print more status messages");
    opts.optflag("","version","Print the version of Fasten and exit");

    return opts;
}

/// a function that processes the options on the command line
/// The brief is a str that describes the program without using the program
/// name, e.g., "counts kmers" for fasten_kmer.
/// This function also takes care of --version.
/// If --help is invoked, then the program name, the brief, and the usage()
/// are all printed to stdout and then the program exits with 0.
// TODO if possible add in default somehow for numcpus
pub fn fasten_base_options_matches(brief:&str, opts:Options) -> Matches{
    let args: Vec<String> = env::args().collect();
    let matches = opts.parse(&args[1..]).expect("ERROR: could not parse parameters");

    if matches.opt_present("version") {
        println!("Fasten v{}", &VERSION);
        std::process::exit(0);
    }
    if matches.opt_present("h") {
        let prog_name = Path::new(&args[0])
            .file_stem().unwrap()
            .to_str().unwrap();
        println!("{}: {}\n\n{}", 
            &prog_name,
            &brief,
            &opts.usage(
                &opts.short_usage(&prog_name)
            ),
        );
        std::process::exit(0);
    }

    return matches;
}

/// Print a formatted message to stderr 
pub fn logmsg<S: AsRef<str>>(stringlike: S) {
    let args: Vec<_> = env::args().collect();
    // is there a better way to get the basename of the program???
    let program = Path::file_name(Path::new(&args[0])).unwrap().to_str().unwrap();
    let str_ref = stringlike.as_ref();
    eprintln!("{}: {}", &program, str_ref);
}

/// Reverse complement a DNA sequence.
/// Take into account lowercase vs uppercase.
/// Ambiguity codes are also handled.
pub fn reverse_complement(dna: &str) -> String {
    // Create a mapping for complement bases, including ambiguity codes.
    let complement_map: HashMap<char, char> = [
        ('A', 'T'), ('T', 'A'), ('G', 'C'), ('C', 'G'),
        ('R', 'Y'), ('Y', 'R'), ('S', 'S'), ('W', 'W'),
        ('K', 'M'), ('M', 'K'), ('B', 'V'), ('V', 'B'),
        ('D', 'H'), ('H', 'D'), ('N', 'N'),
        ('a', 't'), ('t', 'a'), ('g', 'c'), ('c', 'g'),
        ('r', 'y'), ('y', 'r'), ('s', 's'), ('w', 'w'),
        ('k', 'm'), ('m', 'k'), ('b', 'v'), ('v', 'b'),
        ('d', 'h'), ('h', 'd'), ('n', 'n'),
    ]
    .iter()
    .cloned()
    .collect();

    // Generate the reverse complement.
    dna.chars()
        .rev()
        .map(|base| complement_map.get(&base).cloned().unwrap_or('N')) // Default to 'N' for unknown bases.
        .collect()
}