1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
//! Sort a fastq file.
//! If the reads are paired end, then the sorted field 
//! concatenates R1 and R2 before comparisons in the sort.
//! R1 and R2 reads will stay together if paired end.
//! 
//! Sorting by GC content will give better compression by magic of gzip
//! and other algorithms.
//! 
//! Sorting can also aid in stable hashsums.
//! 
//! # Examples
//! 
//! ## stable hashsum
//! ```bash
//! cat file.fastq | fasten_sort | md5sum > file.fastq.md5
//! ```
//! ## better compression by sorting by GC content
//! ```bash
//! zcat file.fastq.gz | fasten_sort --sort-by GC | gzip -c > smaller.fastq.gz
//! 
//! ## get good compression from paired end reads
//! ```bash
//! zcat R1.fastq.gz R2.fastq.gz | fasten_shuffle | \
//!   fasten_sort --paired-end --sort-by GC | \
//!   fasten_shuffle -d -1 sorted_1.fastq -2 sorted_2.fastq && \
//!   gzip -v sorted_1.fastq sorted_2.fastq
//! ```
//! Compare compression between unsorted and sorted
//! from the previous example
//! ```bash
//! ls -lh sorted_1.fastq.gz sorted_2.fastq.gz
//! ```
//! 
//! # Usage 
//! 
//! ```text
//! Usage: fasten_sort [-h] [-n INT] [-p] [-v] [-s STRING] [-r]
//!
//! Options:
//!     -h, --help          Print this help menu.
//!     -n, --numcpus INT   Number of CPUs (default: 1)
//!     -p, --paired-end    The input reads are interleaved paired-end
//!     -v, --verbose       Print more status messages
//!     -s, --sort-by STRING
//!                         Sort by either SEQ, GC, or ID. If GC, then the entries
//!                         are sorted by GC percentage. SEQ and ID are
//!                         alphabetically sorted.
//!     -r, --reverse       Reverse sort
//! ```

extern crate getopts;
extern crate fasten;
extern crate regex;
extern crate threadpool;
use std::fs::File;
use std::io::BufReader;
use std::io::BufRead;

use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
//use fasten::logmsg;

#[test]
fn test_sort_fastq_basic () {
    let is_paired_end = false;
    let which_field   = "ID";
    let my_file = File::open("testdata/four_reads.fastq").expect("Could not open file");
    let my_buffer=BufReader::new(my_file);
    let mut buffer_iter = my_buffer.lines();

    let mut entries:Vec<Seq> = vec![];
    while let Some(line) = buffer_iter.next() {
        let id1  = line.expect("ERROR reading the ID line");
        let seq1 = buffer_iter.next().expect("ERROR reading a sequence line")
            .expect("ERROR reading a sequence line");
                  buffer_iter.next().expect("ERROR reading a plus line")
                      .expect("ERROR reading the plus line");
        let qual1= buffer_iter.next().expect("ERROR reading a qual line")
            .expect("ERROR reading a qual line");

        let mut id2   =String::new();
        let mut seq2  =String::new();
        let mut qual2 =String::new();

        // Get R2
        if is_paired_end {
            id2  = buffer_iter.next().expect("ERROR reading the ID line")
                .expect("ERROR reading the ID line");
            seq2 = buffer_iter.next().expect("ERROR reading a sequence line")
                .expect("ERROR reading a sequence line");
                       buffer_iter.next().expect("ERROR reading a plus line")
                          .expect("ERROR reading the plus line");
            qual2= buffer_iter.next().expect("ERROR reading a qual line")
                .expect("ERROR reading a qual line");
        }

        let entry:Seq = Seq {
            pe:     is_paired_end,
            id1:    id1,
            seq1:   seq1,
            qual1:  qual1,
            id2:    id2,
            seq2:   seq2,
            qual2:  qual2
        };

        entries.push(entry);

    }

    // test that each read is readX in a sorted order
    let sorted_entries:Vec<Seq> = sort_entries(entries.clone(), &which_field, false);
    for i in 0..4 {
        let expected = format!("@read{}",i);
        assert_eq!(sorted_entries[i].id1, expected, "Sort by ID");
    }

    // test reverse sorting
    let reverse_sorted_entries:Vec<Seq> = sort_entries(entries.clone(), &which_field, true);
    for i in 0..4 {
        let expected = format!("@read{}",i);
        let idx = 3 - i;
        assert_eq!(reverse_sorted_entries[idx].id1, expected, "Reverse sort by ID. Full list is {:?}", reverse_sorted_entries);
    }
}

/// A sequence struct that is paired-end aware
#[derive(Debug, Clone)]
struct Seq {
    pe:     bool,
    id1:    String,
    seq1:   String,
    qual1:  String,
    id2:    String,
    seq2:   String,
    qual2:  String,
}

fn main(){
    let mut opts = fasten_base_options();
    // Options specific to this script
    opts.optopt("s","sort-by","Sort by either SEQ, GC, or ID. If GC, then the entries are sorted by GC percentage. SEQ and ID are alphabetically sorted.","STRING");
    opts.optflag("r","reverse","Reverse sort");

    let matches = fasten_base_options_matches("Sort reads. This can be useful for many things including checksums and reducing gzip file sizes. Remember to use --paired-end if applicable.", opts);

    let my_file = File::open("/dev/stdin").expect("Could not open file");
    let my_buffer=BufReader::new(my_file);

    let is_paired_end=matches.opt_present("paired-end");
    let reverse_sort =matches.opt_present("reverse");

    let which_field:String={
        if matches.opt_present("sort-by") {
            matches.opt_str("sort-by").expect("ERROR parsing --sort-by")
                .to_uppercase()
        } else {
            "ID".to_string()
        }
    };

    let _num_cpus:usize = {
        if matches.opt_present("numcpus") {
            matches.opt_str("numcpus").expect("ERROR parsing --numcpus")
                .parse()
                .expect("ERROR: numcpus is not an integer")
        } else {
            1
        }
    };

    let mut buffer_iter = my_buffer.lines();

    let mut entries:Vec<Seq> = vec![];
    while let Some(line) = buffer_iter.next() {
        let id1  = line.expect("ERROR reading the ID line");
        let seq1 = buffer_iter.next().expect("ERROR reading a sequence line")
            .expect("ERROR reading a sequence line");
                  buffer_iter.next().expect("ERROR reading a plus line")
                      .expect("ERROR reading the plus line");
        let qual1= buffer_iter.next().expect("ERROR reading a qual line")
            .expect("ERROR reading a qual line");

        let mut id2   =String::new();
        let mut seq2  =String::new();
        let mut qual2 =String::new();

        // Get R2
        if is_paired_end {
            id2  = buffer_iter.next().expect("ERROR reading the ID line")
                .expect("ERROR reading the ID line");
            seq2 = buffer_iter.next().expect("ERROR reading a sequence line")
                .expect("ERROR reading a sequence line");
                       buffer_iter.next().expect("ERROR reading a plus line")
                          .expect("ERROR reading the plus line");
            qual2= buffer_iter.next().expect("ERROR reading a qual line")
                .expect("ERROR reading a qual line");
        }

        let entry:Seq = Seq {
            pe:     is_paired_end,
            id1:    id1,
            seq1:   seq1,
            qual1:  qual1,
            id2:    id2,
            seq2:   seq2,
            qual2:  qual2
        };

        entries.push(entry);

    }

    let sorted_entries:Vec<Seq> = sort_entries(entries, &which_field, reverse_sort);
    for entry in sorted_entries {
        println!("{}\n{}\n+\n{}", entry.id1, entry.seq1, entry.qual1);
        if entry.pe {
            println!("{}\n{}\n+\n{}", entry.id2, entry.seq2, entry.qual2);
        }
    }

}

/// Sort fastq entries in a vector
fn sort_entries (unsorted:Vec<Seq>, which_field:&str, reverse_sort:bool) -> Vec<Seq> {

    //logmsg(&format!("REVERSE SORT? {}\n{:?}",reverse_sort, &unsorted));

    // I want to avoid editing the vector in place
    let mut sorted = unsorted.clone();

    // Actual sort
    // TODO sort by length?
    match which_field{
        "ID" => {
            sorted.sort_by(|a, b| {
                let a_id = format!("{}{}", a.id1, a.id2);
                let b_id = format!("{}{}", b.id1, b.id2);
                a_id.cmp(&b_id)
            });
        },
        "SEQ" => {
            sorted.sort_by(|a, b| {
                let a_seq = format!("{}{}", a.seq1, a.seq2);
                let b_seq = format!("{}{}", b.seq1, b.seq2);
                a_seq.cmp(&b_seq)
            });
        },
        "GC"  => {
            sorted.sort_by(|a,b| {
                let a_seq = format!("{}{}", a.seq1, a.seq2);
                let b_seq = format!("{}{}", b.seq1, b.seq2);

                let a_gc:f32  = a_seq.matches(&['G','g','C','c']).count() as f32 / a_seq.len() as f32;
                let b_gc:f32  = b_seq.matches(&['G','g','C','c']).count() as f32 / b_seq.len() as f32;
                //logmsg(format!("{} ({}) <=> {} ({})", a.id1, a_gc, b.id1, b_gc));
                a_gc.partial_cmp(&b_gc).unwrap()
            });
        },
        _   => {
            panic!("Tried to sort by {} which is not implemented", which_field);
        }
    };

    if reverse_sort {
        //logmsg("REVERSE SORT");
        sorted.reverse();
    }

    return sorted;
}