1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
//! Normalizes kmer depth by removing some reads from high kmer depths
//! The input has to be from `fasten_kmer --remember-reads` where there are at least three columns:
//! kmer, count, read1, [read2,...]
//!
//! This was inspired by BBNorm and is probably not the exact same algorithm.
//! <https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbnorm-guide/>

//! # Examples

//! ```bash
//! cat testdata/four_reads.fastq | \
//!   fasten_kmer -k 5 --remember-reads | \
//!   fasten_normalize | \
//!   gzip -c > four_reads.normalized.fastq.gz
//! ```
//!
//! Paired end reads
//! 
//! ```bash
//! cat testdata/R[12].fastq | \
//!   fasten_shuffle | \
//!   fasten_kmer -k 3 -m --paired-end | \
//!   fasten_normalize --target-depth 10 --paired-end | \
//!   gzip -c > normalized.fastq.gz
//! ```

//! # Usage

//! ```text
//! Usage: fasten_normalize [-h] [-n INT] [-p] [--verbose] [--version] [-t INT]
//!
//! Options:
//!     -h, --help          Print this help menu.
//!     -n, --numcpus INT   Number of CPUs (default: 1)
//!     -p, --paired-end    The input reads are interleaved paired-end
//!         --verbose       Print more status messages
//!         --version       Print the version of Fasten and exit
//!     -t, --target-depth INT
//!                         The target depth of kmer.
//! ```
//!
//! # Algorithm
//! 
//! `fasten_normalize` will downsample reads pertaining to each kmer.
//! For example, if `AAAA` is found in the `fasten_kmer` output 100
//! times, but you request 10x coverage, it will remove 90% of the 
//! reads pertaining to `AAAA`.
//!
//! Specifically:
//!
//! 1. `fasten_kmer` shows reads that begin with that kmer
//! 2. `fasten_kmer` shows extra columns with R1/R2 if R1 begins with that kmer.
//! If more than one read or read pair begins with that kmer, it will be displayed in
//! subsequent columns.
//! 3. `fasten_normalize` randomly selects reads that begin with that kmer
//! and brings the number of reads down to that target coverage.
//!
//! # Choosing the correct k
//!
//! Choose a kmer length that is unique enough in the genome
//! but that will not be long enough to run into read-level errors.
//! In the examples above, k=3 is likely very short.
//! Starting with something like k=31 is probably a good start.
    
extern crate fasten;
extern crate getopts;
extern crate rand;

use std::io::BufReader;
use std::io::BufRead;
use std::io::stdin;
use std::io::Stdin;
use std::cmp::min;
use rand::prelude::*;

use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
//use fasten::logmsg;

/// Glues together paired end reads internally and is a
/// character not expected in any read
const READ_SEPARATOR :char = '~';

fn main(){
    let mut opts = fasten_base_options();

    // script-specific options
    opts.optopt("t", "target-depth", "The target depth of kmer.", "INT");
    let matches = fasten_base_options_matches("Normalizes reads based on kmer coverage.", opts);

    let target_depth :u32 = matches.opt_str("target-depth")
        .expect("need --target-depth")
        .parse()
        .expect("Convert target-depth to integer");

    let stdin = stdin();

    let paired_end = matches.opt_present("paired-end");
    
    normalize_coverage(stdin, target_depth, paired_end);
}

/// Normalize the coverage to a certain target and print as a fastq
fn normalize_coverage (stdin:Stdin, target_depth:u32, paired_end:bool) {
    // start off a random thing so that we can get random reads later on
    let mut rng = rand::thread_rng();

    // read the file
    let my_buffer=BufReader::new(stdin);
    let mut buffer_iter = my_buffer.lines();
    while let Some(line_opt) = buffer_iter.next() {
        let line = line_opt.expect("read the next line");

        // get the fields: kmer, count, reads...
        let mut f :Vec<&str> = line.split("\t").collect();
        // No need to normalize if there are no reads and therefore nothing in field 3
        if f.len() < 3 {
            continue;
        }

        let kmer_count :Vec<&str> = f.splice(0..2, vec![]).collect();
        //let _kmer = kmer_count[0];
        let count :u32 = kmer_count[1].parse().unwrap();

        // number of reads to keep is the target depth / kmer coverage * number of reads present
        let mut num_reads_to_keep :usize = min(
            (target_depth as f32 / count as f32 * f.len() as f32).ceil() as usize,
            f.len() as usize
        ) as usize;
        if paired_end {
            num_reads_to_keep = (num_reads_to_keep as f32 / 2.0).ceil() as usize;
        }

        //println!("target depth:{} count:{} num reads:{} = {}", target_depth, count, f.len(), num_reads_to_keep);
        
        // shuffle the reads in place
        f.shuffle(&mut rng);
        // take the top X reads
        let reads_to_keep :Vec<&str> = f.splice(0..num_reads_to_keep, vec![]).collect();

        print_reads(reads_to_keep);
    }
}

/// Print the reads in fastq format when given in a single line with `~`
fn print_reads (reads:Vec<&str>) {
    for entry in reads{
        let entry_string = entry.replace(READ_SEPARATOR, "\n");
        println!("{}", entry_string);
    }
}