1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
//! Validates your reads and makes you feel good about yourself!
//! 
//! # Examples
//! 
//! ## Quick validation with stderr message
//! ```bash
//! cat file.fastq | fasten_validate --verbose
//! ```
//!
//! ## Validate that your reads are paired end
//! ```bash
//! cat R1.fastq R2.fastq | fasten_shuffle | fasten_validate --paired-end
//! ```
//!
//! ## Parallelize
//! Large-scale validation of PE reads
//! with 4 CPUs and xargs
//! ```bash
//! \ls *_1.fastq.gz | xargs -n 1 -P 4 bash -c '
//!   echo -n "." >&2 # progress bar
//!   R1=$0
//!   R2=${0/_1.fastq.gz/_2.fastq.gz}
//!   zcat $R1 $R2 | fasten_shuffle | fasten_validate --paired-end
//! '
//! ```
//! 
//! # Usage
//! 
//! ```text
//!     Usage: fasten_validate [-h] [-n INT] [-p] [-v] [--min-length INT] [--min-quality FLOAT] [--paired-end] [--print-reads] [-v]
//!     
//!     Options:
//!         -h, --help          Print this help menu.
//!         -n, --numcpus INT   Number of CPUs (default: 1)
//!         -p, --paired-end    The input reads are interleaved paired-end
//!         -v, --verbose       Print more status messages
//!             --min-length INT
//!                             Minimum read length allowed
//!             --min-quality FLOAT
//!                             Minimum quality allowed
//!             --paired-end    The reads are interleaved paired-end
//!             --print-reads   Print the reads as they are being validated (useful
//!                             for unix pipes)
//! ```

extern crate getopts;
extern crate fasten;
extern crate regex;
use std::fs::File;
use std::io::BufReader;
use std::io::BufRead;

use regex::Regex;

use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
use fasten::logmsg;

fn main(){
    logmsg("NOTE: fasten_validate is deprecated starting in v0.6 in favor of fasten_inspect and fasten_repair");
    let mut opts = fasten_base_options();
    // Options specific to this script
    opts.optopt("","min-length","Minimum read length allowed","INT");
    opts.optopt("","min-quality","Minimum quality allowed","FLOAT");
    opts.optflag("","paired-end","The reads are interleaved paired-end");
    opts.optflag("","print-reads","Print the reads as they are being validated (useful for unix pipes)");

    let matches = fasten_base_options_matches("Validates your reads and makes you feel good about yourself!", opts);

    let my_file = File::open("/dev/stdin").expect("Could not open file");
    let my_buffer=BufReader::new(my_file);

    let lines_per_read={
        if matches.opt_present("paired-end") {
            8
        }else{
            4
        }
    };
    let min_length :usize={
        if matches.opt_present("min-length") {
            matches.opt_str("min-length")
                .expect("ERROR parsing min-length")
                .parse()
                .expect("ERROR parsing min-length as INT")
        } else {
            0
        }
    };
    let min_qual :f32={
        if matches.opt_present("min-quality") {
            matches.opt_str("min-quality")
                .expect("ERROR parsing min-quality")
                .parse()
                .expect("ERROR parsing min-quality as FLOAT")
        } else {
            0.0
        }
    };


    // save this option to avoid the overhead of calling
    // opt_present many times in a loop
    let should_print=matches.opt_present("print-reads");

    // If there is a match on these, then mark invalid.
    // In other words, we are looking for a pattern that
    // is NOT the target seq or qual
    let seq_regex = Regex::new(r"[^a-zA-Z]").expect("malformed seq regex");
    //let qual_regex= Regex::new(r"[^!-Za-z]").expect("malformed qual regex");
    let qual_regex= Regex::new(r"\s").expect("malformed qual regex");

    // TODO have a print buffer, something like 4096 bytes

    let mut i = 0;
    for line in my_buffer.lines() {
        let line=line.expect("ERROR: did not get a line");
        if should_print {
            println!("{}",line);
        }

        // TODO pattern match for each kind of line:
        // id, seq, +, qual
        match i%4{
            0=>{
                if line.chars().nth(0).unwrap() != '@' {
                    panic!("ERROR: first character of the identifier is not @ in the line {}. Contents are:\n  {}",i,line); 
                }
            }
            1=>{
                if seq_regex.is_match(&line) {
                    panic!("ERROR: there are characters that are not in the alphabet in line {}. Contents are:\n  {}",i,line);
                }
                if min_length > 0 && line.len() > min_length {
                    panic!("ERROR: sequence at line {} is less than the minimum sequence length",i);
                }
            }
            2=>{
                if line.chars().nth(0).unwrap() != '+' {
                    panic!("ERROR: first character of the qual identifier is not + in the line {}. Contents are:\n  {}",i,line); 
                }
            }
            3=>{
                if qual_regex.is_match(&line) {
                    for cap in qual_regex.captures_iter(&line) {
                        eprintln!("Illegal qual character found: {}", &cap[0]);
                    }
                    panic!("ERROR: there are characters that are not qual characters in line {}. Contents are:\n  {}",i,line);
                }
                // only calculate read quality if we are testing for it
                if min_qual > 0.0 {
                    let mut qual_total :usize = 0;
                    for q in line.chars() {
                        qual_total += q as usize;
                    }
                    let avg_qual :f32 = qual_total as f32 / line.len() as f32 - 33.0;
                    if avg_qual < min_qual {
                        panic!("ERROR: quality is less than min qual in line {}.\n  Avg qual is {}.\n  Min qual is {}\n  Contents are:\n  {}",i,avg_qual,min_qual,line);
                    }
                }
            }
            _=>{
                panic!("INTERNAL ERROR");
            }
        }
        i += 1;
    }
    if i % lines_per_read > 0{
        panic!("ERROR: incomplete fastq entry. Num lines: {}",i);
    }

    if matches.opt_present("verbose") {
        fasten::logmsg("These reads have been validated!");
    }
}