1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
//! Sort a fastq file.
//! If the reads are paired end, then the sorted field 
//! concatenates R1 and R2 before comparisons in the sort.
//! R1 and R2 reads will stay together if paired end.
//! 
//! Sorting by GC content will give better compression by magic of gzip
//! and other algorithms.
//! 
//! Sorting can also aid in stable hashsums.
//! 
//! # Examples
//! 
//! ## stable hashsum
//! ```bash
//! cat file.fastq | fasten_sort | md5sum > file.fastq.md5
//! ```
//! ## better compression by sorting by GC content
//! ```bash
//! zcat file.fastq.gz | fasten_sort --sort-by GC | gzip -c > smaller.fastq.gz
//! 
//! ## get good compression from paired end reads
//! ```bash
//! zcat R1.fastq.gz R2.fastq.gz | fasten_shuffle | \
//!   fasten_sort --paired-end --sort-by GC | \
//!   fasten_shuffle -d -1 sorted_1.fastq -2 sorted_2.fastq && \
//!   gzip -v sorted_1.fastq sorted_2.fastq
//! ```
//! 
//! Compare compression between unsorted and sorted
//! from the previous example
//! 
//! ```bash
//! ls -lh sorted_1.fastq.gz sorted_2.fastq.gz
//! ```
//! 
//! ## Fast sorting of large files
//! 
//! If you want reads sorted but do not care if _everything_ is sorted,
//! you can sort in chunks. This is useful for streaming large files.
//! 
//! ```bash
//! zcat large.fastq.gz | fasten_sort --paired-end --chunk-size 1000 | gzip -c > sorted.fastq.gz
//! ```
//! 
//! # Usage 
//! 
//! ```text
//! Usage: fasten_sort [-h] [-n INT] [-p] [-v] [-s STRING] [-r]
//!
//! Options:
//!     -h, --help          Print this help menu.
//!     -n, --numcpus INT   Number of CPUs (default: 1)
//!     -p, --paired-end    The input reads are interleaved paired-end
//!     -v, --verbose       Print more status messages
//!     -s, --sort-by STRING
//!                         Sort by either SEQ, GC, or ID. If GC, then the entries
//!                         are sorted by GC percentage. SEQ and ID are
//!                         alphabetically sorted.
//!     -r, --reverse       Reverse sort
//!     -c, --chunk-size INT
//!                         If > 0, then chunks of reads or pairs will be sorted
//!                         instead of the whole set. This is useful for streaming
//!                         large files. Default: 0
//! ```

extern crate getopts;
extern crate fasten;
extern crate regex;
extern crate threadpool;
use std::fs::File;
use std::io::BufReader;
use std::io::BufRead;

use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
//use fasten::logmsg;

#[test]
fn test_sort_fastq_basic () {
    let is_paired_end = false;
    let which_field   = "ID";
    let my_file = File::open("testdata/four_reads.fastq").expect("Could not open file");
    let my_buffer=BufReader::new(my_file);
    let mut buffer_iter = my_buffer.lines();

    let mut entries:Vec<Seq> = vec![];
    while let Some(line) = buffer_iter.next() {
        let id1  = line.expect("ERROR reading the ID line");
        let seq1 = buffer_iter.next().expect("ERROR reading a sequence line")
            .expect("ERROR reading a sequence line");
                  buffer_iter.next().expect("ERROR reading a plus line")
                      .expect("ERROR reading the plus line");
        let qual1= buffer_iter.next().expect("ERROR reading a qual line")
            .expect("ERROR reading a qual line");

        let mut id2   =String::new();
        let mut seq2  =String::new();
        let mut qual2 =String::new();

        // Get R2
        if is_paired_end {
            id2  = buffer_iter.next().expect("ERROR reading the ID line")
                .expect("ERROR reading the ID line");
            seq2 = buffer_iter.next().expect("ERROR reading a sequence line")
                .expect("ERROR reading a sequence line");
                       buffer_iter.next().expect("ERROR reading a plus line")
                          .expect("ERROR reading the plus line");
            qual2= buffer_iter.next().expect("ERROR reading a qual line")
                .expect("ERROR reading a qual line");
        }

        let entry:Seq = Seq {
            pe:     is_paired_end,
            id1:    id1,
            seq1:   seq1,
            qual1:  qual1,
            minimizer1: String::new(),
            id2:    id2,
            seq2:   seq2,
            qual2:  qual2,
            minimizer2: String::new()
        };

        entries.push(entry);

    }

    // test that each read is readX in a sorted order
    let sorted_entries:Vec<Seq> = sort_entries(entries.clone(), &which_field, false);
    for i in 0..4 {
        let expected = format!("@read{}",i);
        assert_eq!(sorted_entries[i].id1, expected, "Sort by ID");
    }

    // test reverse sorting
    let reverse_sorted_entries:Vec<Seq> = sort_entries(entries.clone(), &which_field, true);
    for i in 0..4 {
        let expected = format!("@read{}",i);
        let idx = 3 - i;
        assert_eq!(reverse_sorted_entries[idx].id1, expected, "Reverse sort by ID. Full list is {:?}", reverse_sorted_entries);
    }
}

/// A sequence struct that is paired-end aware
#[derive(Debug, Clone)]
struct Seq {
    pe:     bool,
    id1:    String,
    seq1:   String,
    qual1:  String,
    minimizer1: String,
    id2:    String,
    seq2:   String,
    qual2:  String,
    minimizer2: String,
}

fn main(){
    let mut opts = fasten_base_options();
    // Options specific to this script
    opts.optopt("s","sort-by","Sort by either SEQ, MINIMIZER, GC, or ID. If GC, then the entries are sorted by GC percentage. SEQ and ID are alphabetically sorted.","STRING");
    opts.optopt("k", "kmer-length", "Length of kmer if using minimizers", "STRING");
    opts.optflag("r","reverse","Reverse sort");
    opts.optopt("c", "chunk-size", "If > 0, then chunks of reads or pairs will be sorted instead of the whole set. This is useful for streaming large files. Default: 0", "INT");

    let matches = fasten_base_options_matches("Sort reads. This can be useful for many things including checksums and reducing gzip file sizes. Remember to use --paired-end if applicable.", opts);

    let my_file = File::open("/dev/stdin").expect("Could not open file");
    let my_buffer=BufReader::new(my_file);

    let is_paired_end=matches.opt_present("paired-end");
    let reverse_sort =matches.opt_present("reverse");

    let which_field:String={
        if matches.opt_present("sort-by") {
            matches.opt_str("sort-by").expect("ERROR parsing --sort-by")
                .to_uppercase()
        } else {
            "ID".to_string()
        }
    };

    let _num_cpus:usize = {
        if matches.opt_present("numcpus") {
            matches.opt_str("numcpus").expect("ERROR parsing --numcpus")
                .parse()
                .expect("ERROR: numcpus is not an integer")
        } else {
            1
        }
    };

    let k: usize = matches.opt_get_default("kmer-length", 21).expect("ERROR parsing --kmer-length");
    let chunk_size: usize = matches.opt_get_default("chunk-size", 0).expect("ERROR parsing --chunk-size");

    let mut buffer_iter = my_buffer.lines();

    let mut entries:Vec<Seq> = vec![];
    while let Some(line) = buffer_iter.next() {
        let id1  = line.expect("ERROR reading the ID line");
        let seq1 = buffer_iter.next().expect("ERROR reading a sequence line")
            .expect("ERROR reading a sequence line");
                  buffer_iter.next().expect("ERROR reading a plus line")
                      .expect("ERROR reading the plus line");
        let qual1= buffer_iter.next().expect("ERROR reading a qual line")
            .expect("ERROR reading a qual line");

        let mut id2   =String::new();
        let mut seq2  =String::new();
        let mut qual2 =String::new();

        // Get R2
        if is_paired_end {
            id2  = buffer_iter.next().expect("ERROR reading the ID line")
                .expect("ERROR reading the ID line");
            seq2 = buffer_iter.next().expect("ERROR reading a sequence line")
                .expect("ERROR reading a sequence line");
                       buffer_iter.next().expect("ERROR reading a plus line")
                          .expect("ERROR reading the plus line");
            qual2= buffer_iter.next().expect("ERROR reading a qual line")
                .expect("ERROR reading a qual line");
        }

        // minimizers: they are costly if we aren't using them
        let mut minimizer1 = String::new();
        let mut minimizer2 = String::new();
        if which_field == "MINIMIZER" {
            minimizer1 = minimizer(&seq1, k);
            minimizer2 = minimizer(&seq2, k);
        }

        let entry:Seq = Seq {
            pe:     is_paired_end,
            id1:    id1, //format!("{} minimizer:{}",id1,minimizer1),
            seq1:   seq1,
            qual1:  qual1,
            minimizer1: minimizer1,
            id2:    id2,
            seq2:   seq2,
            qual2:  qual2,
            minimizer2: minimizer2
        };

        entries.push(entry);

        // If we are chunking, then sort and print the chunk
        if chunk_size > 0 && entries.len() == chunk_size {
            let sorted_entries:Vec<Seq> = sort_entries(entries, &which_field, reverse_sort);
            for entry in sorted_entries {
                println!("{}\n{}\n+\n{}", entry.id1, entry.seq1, entry.qual1);
                if entry.pe {
                    println!("{}\n{}\n+\n{}", entry.id2, entry.seq2, entry.qual2);
                }
            }
            entries = vec![];
        }

    }

    // If we aren't chunking then just print everything sorted
    if chunk_size == 0 {
        let sorted_entries:Vec<Seq> = sort_entries(entries, &which_field, reverse_sort);
        for entry in sorted_entries {
            println!("{}\n{}\n+\n{}", entry.id1, entry.seq1, entry.qual1);
            if entry.pe {
                println!("{}\n{}\n+\n{}", entry.id2, entry.seq2, entry.qual2);
            }
        }
    }

}

/// Find the lexicographically smallest kmer in a sequence.
fn minimizer (sequence:&str, k: usize) -> String {

    // if the length of the sequence is less than k, then return the sequence
    if sequence.len() < k {
        return sequence.to_string();
    }
    
    let mut min = "z".repeat(k);
    for i in 0..sequence.len()-k {
        let kmer = &sequence[i..i+k];
        if kmer < &min {
            min = kmer.to_string();
        }
    }

    return min;
}

/// Sort fastq entries in a vector
fn sort_entries (unsorted:Vec<Seq>, which_field:&str, reverse_sort:bool) -> Vec<Seq> {

    //logmsg(&format!("REVERSE SORT? {}\n{:?}",reverse_sort, &unsorted));

    // I want to avoid editing the vector in place
    let mut sorted = unsorted.clone();

    // Actual sort
    // TODO sort by length?
    match which_field{
        "ID" => {
            sorted.sort_by(|a, b| {
                if a.id1 != b.id1 {
                    a.id1.cmp(&b.id1)
                } else {
                    a.id2.cmp(&b.id2)
                }
            });
        },
        "SEQ" => {
            sorted.sort_by(|a, b| {
                if a.seq1 != b.seq1 {
                    a.seq1.cmp(&b.seq1)
                } else {
                    a.seq2.cmp(&b.seq2)
                }
            });
        },
        "GC"  => {
            sorted.sort_by(|a,b| {
                let a_seq = format!("{}{}", a.seq1, a.seq2);
                let b_seq = format!("{}{}", b.seq1, b.seq2);

                let a_gc:f32  = a_seq.matches(&['G','g','C','c']).count() as f32 / a_seq.len() as f32;
                let b_gc:f32  = b_seq.matches(&['G','g','C','c']).count() as f32 / b_seq.len() as f32;
                //logmsg(format!("{} ({}) <=> {} ({})", a.id1, a_gc, b.id1, b_gc));
                a_gc.partial_cmp(&b_gc).unwrap()
            });
        },
        "MINIMIZER" => {
            sorted.sort_by(|a,b| {
                // Sort by minimizer1 first
                if a.minimizer1 != b.minimizer1 {
                    a.minimizer1.cmp(&b.minimizer1)
                }
                // If the R1 minimizer isn't the same, then compare on minimizer2
                else if a.minimizer2 != b.minimizer2 {
                    a.minimizer2.cmp(&b.minimizer2)
                }
                // If both minimizers are the same, then sort by R1 SEQ
                else if a.seq1 != b.seq1 {
                    a.seq1.cmp(&b.seq1)
                }
                // and then lastely by R2 SEQ
                else {
                    a.seq2.cmp(&b.seq2)
                }
            });
        },
        _   => {
            panic!("Tried to sort by {} which is not implemented", which_field);
        }
    };

    if reverse_sort {
        //logmsg("REVERSE SORT");
        sorted.reverse();
    }

    return sorted;
}