1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
//! Determine paired-end-ness in an interleaved file.
//! Exit code of 0 indicates PE. Exit code > 0 indicates single end.
//! 
//! # Examples
//! 
//! ## Test the file and then print a message with the exit code
//! ```bash
//! cat file.fastq | fasten_pe; echo "Reads were paired-end? $?";
//! ```
//! ## Test the file and if it is paired end (exit code 0), then print a message
//! ```bash
//! cat file.fastq | fasten_pe || echo "Reads were paired end.";
//! ```
//! 
//! # Usage
//! 
//! ```text
//! Usage: fasten_pe [-h] [-n INT] [-p] [-v] [--print-reads]
//! 
//! Options:
//!     -h, --help          Print this help menu.
//!     -n, --numcpus INT   Number of CPUs (default: 1)
//!     -p, --paired-end    The input reads are interleaved paired-end
//!     -v, --verbose       Print more status messages
//!         --print-reads   Print each read. Useful for Unix pipes.
//! ```    
extern crate fasten;
extern crate regex;

use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
use fasten::logmsg;
use regex::Regex;

use std::fs::File;
use std::io::BufReader;
use std::io::BufRead;

fn main() {
    let mut opts = fasten_base_options();
    opts.optflag("","print-reads","Print each read. Useful for Unix pipes.");
    let matches = fasten_base_options_matches("Determine paired-end-ness in an interleaved file. Exit code of 0 indicates PE. Exit code > 0 indicates SE.", opts);
    
    if matches.opt_present("paired-end") {
        logmsg("WARNING: --paired-end was supplied but this script is supposed to determine whether the input is paired-end.");
    }

    let should_print = matches.opt_present("print-reads");

    // Save the top X ID pairs in a vector
    // and compare them in several functions that test for
    // IDs. If any test returns true, then we can say that
    // it is paired-end input.
    let mut id1_vec :Vec<String>=Vec::new();
    let mut id2_vec :Vec<String>=Vec::new();
    let my_file = File::open("/dev/stdin").expect("Could not open file");
    let my_buffer=BufReader::new(my_file);
    let mut lines = my_buffer.lines();
    while let Some(line) = lines.next() {
        let id1 = line.expect("ERROR parsing id line in R1");
        let mut entry=id1.clone();
        entry.push('\n');
        for _ in 1..4 { // move ahead three lines
            let other_line = lines.next()
                .expect("ERROR getting next line")
                .expect("ERROR parsing next line");
            entry.push_str(&other_line);
            entry.push('\n');
        }
        let id2 = lines.next()
            .expect("ERROR getting R2. This is not a paired end file.")
            .expect("ERROR parsing next line in R2");
        entry.push_str(&id2);
        entry.push('\n');
        for _ in 1..4 { // move ahead three lines
            let other_line = lines.next()
                .expect("ERROR getting next line in R2. This is not a paired end file.")
                .expect("ERROR parsing next line in R2");
            entry.push_str(&other_line);
            entry.push('\n');
        }
        id1_vec.push(id1);
        id2_vec.push(id2);

        if should_print {
            print!("{}",entry);
        }
    }

    // For every function we can think of, throw the ID
    // vectors against them. If any of them return >0,
    // then we have a positive
    let mut is_paired_end :u8 = 0;
    is_paired_end += is_paired_end_slash12(&id1_vec, &id2_vec);
    is_paired_end += is_paired_end_miseq(&id1_vec, &id2_vec);

    // If this is not paired end, return a nonzero
    if is_paired_end == 0 {
        if matches.opt_present("verbose") {
            logmsg("I do not think this is interleaved paired end");
        }
        std::process::exit(1);
    }
    
    if matches.opt_present("verbose") {
        logmsg("The fastq input seems to be interleaved paired-end");
    }

}

/// Detect whether the vector of IDs represent paired-endedness
/// with finding out whether they end with `/1` and `/2`.
fn is_paired_end_slash12(id1_ref: &Vec<String>, id2_ref: &Vec<String>) -> u8 {

    // We are using iterators and want to reset them
    let id1_vec = id1_ref.clone();
    let id2_vec = id2_ref.clone();
    
    // @.../1
    let slash_r1r2_regex = Regex::new(r"(.+)/([12])$").expect("malformed qual regex");

    let mut id2_iter = id2_vec.iter();
    for id1 in id1_vec.iter(){
        let id2 = id2_iter.next()
            .expect("ERROR getting next id2");

        let caps1_result = slash_r1r2_regex.captures(&id1);
        let caps2_result = slash_r1r2_regex.captures(&id2);

        // Test for whether the regex matched
        if caps1_result.is_none() || caps2_result.is_none() {
            return 0;
        }

        // If we have a match then get the captured values
        let caps1 = caps1_result
            .expect("ERROR: could not regex against id1");
        let caps2 = caps2_result
            .expect("ERROR: could not regex against id2");

        // Make sure the base name matches
        if caps1[1] != caps2[1] {
            return 0;
        }
        // Make sure there is a 1/2 combo
        if &caps1[2] != "1" || &caps2[2] != "2" {
            return 0;
        }
    }

    return 1;
}

/// Detect whether the vector of IDs represent paired-endedness
/// with finding out whether they fit the pattern shown in Illumina documentation at
/// <http://support.illumina.com/content/dam/illumina-support/help/BaseSpaceHelp_v2/Content/Vault/Informatics/Sequencing_Analysis/BS/swSEQ_mBS_FASTQFiles.htm>
fn is_paired_end_miseq(id1_ref: &Vec<String>, id2_ref: &Vec<String>) -> u8 {
    // We are using iterators and want to reset them
    let id1_vec = id1_ref.clone();
    let id2_vec = id2_ref.clone();
    let mut id2_iter = id2_vec.iter();

    for id1 in id1_vec.iter(){
        let id2 = id2_iter.next()
            .expect("ERROR getting next id2");


        // Get the 7th field which is a combination of the
        // Y position and the read number, separated by a
        // space.
        let id1_tmp = id1.split(":").nth(6);
        let id2_tmp = id2.split(":").nth(6);
        if id1_tmp.is_none() || id2_tmp.is_none() {
            return 0;
        }

        // Get the read number. It has to be 1 and 2.
        let id1_read_number = id1_tmp.unwrap().split(" ").nth(1);
        let id2_read_number = id2_tmp.unwrap().split(" ").nth(1);
        if id1_read_number.is_none() || id2_read_number.is_none() {
            return 0;
        }
        if id1_read_number.unwrap() != "1" || id2_read_number.unwrap() != "2" {
            return 0;
        }
    }

    return 1;
}