fasten_metrics/
fasten_metrics.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
//! Gives read metrics on a read set.
//! Values are given in a column delimited stdout.

//! # Examples

//! ```bash
//! cat testdata/four_reads.fastq | fasten_metrics | column -t
//! ```

//! # Usage
//! ```text

//! Usage: fasten_metrics [-h] [-n INT] [-p] [-v] [--each-read]

//! Options:
//!    -h, --help          Print this help menu.
//!    -n, --numcpus INT   Number of CPUs (default: 1)
//!    -p, --paired-end    The input reads are interleaved paired-end
//!    -v, --verbose       Print more status messages
//!        --each-read     Print the metrics for each read. This creates very
//!                        large output
//! ```
    
extern crate fasten;
extern crate statistical;
extern crate getopts;

use std::fs::File;
use std::io::BufReader;
use std::io::BufRead;
//use std::f32;
use std::f64;

use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
use fasten::logmsg;

#[test]
fn test_average_quality () {
    let easy_qual = "IIIIIIIIIIII";
    let easy_avg_obs = average_quality(easy_qual);
    let easy_avg_exp:f64 = 40.0;
    assert_eq!(easy_avg_obs, easy_avg_exp, "Tried to calculate average quality for {}", easy_qual);

    // a more difficult qual is the one in the first read in four_reads.fastq
    let hard_qual = "8AB*2D>C1'02C+=I@IEFHC7&-E5',I?E*33E/@3#68B%\"!B-/2%(G=*@D052IA!('7-*$+A6>.$89,-CG71=AGAE3&&#=2B.+I<E";
    let hard_avg_obs = average_quality_from_cigar(hard_qual);
    let hard_avg_exp:f64 = 21.40;
    assert_eq!(hard_avg_obs, hard_avg_exp, "Tried to calculate the average quality for {}", hard_qual);
}

fn main(){
    let mut opts = fasten_base_options();

    // script-specific options
    opts.optflag("","each-read","Print the metrics for each read. This creates very large output");

    let matches = fasten_base_options_matches("Gives read metrics on a read set.", opts);

    if matches.opt_present("paired-end") {
        logmsg("WARNING: --paired-end is not utilized in this script");
    }

    let each_read :bool=matches.opt_present("each-read");

    let filename = "/dev/stdin";
    
    // Header
    if each_read {
        println!("readID\treadLength\tavgQual");
    } else {
        println!("{}",vec!["totalLength", "numReads", "avgReadLength","avgQual"].join("\t"));

    }
    
    let mut read_length :Vec<f64> = vec![];
    let mut read_qual   :Vec<u8>  = vec![];
    let mut num_lines   :u64   =0;

    // read the file
    let my_file = File::open(&filename).expect("Could not open file");
    let my_buffer=BufReader::new(my_file);
    for line in my_buffer.lines() {
        num_lines+=1;
        let mod_line = num_lines % 4;

        match mod_line {
            1 => {
                if each_read {
                    let id = line.expect("Expected an ID line");
                    // remove the @
                    print!("{}\t",&id[1..]);
                }
            }
            2 => {
                let my_read_length=line.expect("Expected a sequence line").len() as f64;
                if each_read {
                    print!("{}\t",my_read_length);
                }
                read_length.push(my_read_length);
            }
            0 => {
                let qual_line=line.expect("Expected a qual line");
                
                let my_qual_vec: Vec<u8> = qual_line.into_bytes();
                // TODO this if statement makes the program take twice as long. Optimize?
                if each_read {
                    let my_avg_qual = avg_qual(&my_qual_vec, 33);
                    println!("{:.2}",my_avg_qual);
                }
                read_qual.extend(my_qual_vec.into_iter());

            }
            _ => {

            }
        };
    }
    let num_reads :u64 = num_lines / 4;
    let total_length = read_length.iter().fold(0.0,|a,&b| a+b);

    let mut summary_metrics=vec![total_length.to_string(),num_reads.to_string()];

    // add statistics if requested
    let total_length_str = (total_length as f64/num_reads as f64).to_string();
    let total_qual_str = format!("{:.2}", avg_qual(&read_qual, 33));

    summary_metrics.push(total_length_str);
    summary_metrics.push(total_qual_str.to_string());

    // summary metrics
    if !each_read {
        println!("{}", summary_metrics.join("\t"));
    }

}

/// Calculates average quality value from a vector of quality bytes
fn avg_qual(qual: &[u8], ascii_base: u8) -> f64 {
    if qual.is_empty() {
        return 0.0;
    }
    
    // Parse quality values (assuming Phred scores)
    let qual_values: Vec<f64> = qual.iter()
        .map(|&q| {
            let phred = (q - ascii_base) as i16;
            // Convert Phred to probability: P = 10^(-Q/10)
            10_f64.powf(-phred as f64 / 10.0)
        })
        .collect();
    
    if qual_values.is_empty() {
        return 0.0;
    }
    
    let sum: f64 = qual_values.iter().sum();
    let avg_prob = sum / qual_values.len() as f64;
    
    // Convert average probability back to Phred score
    -10.0 * avg_prob.log10()
}