#!/usr/bin/env perl

=pod

=head1 NAME

filterlinks - filter the link file based on link parameters

=head1 SYNOPSIS

  filterlinks -links linkfile.txt [-nointer] [-nointra] [-debug]

=head1 DESCRIPTION

Parses a Circos link file and applies rules to identify links with specific properties.

=head1 RULES

A link is filtered using the value of one or more of its parameters, such as size or position.

For each parameter, several condition can be defined to determine whether the parameter value is passed.

Additionally the -nointer and -nointra flags can be used to remove
inter-chromosomal links (ends of link are on different chromosomes)
and intra-chromosomal links (ends of link are on the same
chromosome). These two rules short-circuit other decision - if a link
does not pass them, no other rules are tested and the link is
immediately rejected.

=head2 Link Parameters

  link_param = condition1,condition2,...

Because each link has two ends, each link parameter may give rise to three distinct rules

  link_param   = condition1,condition2,...
  link_param_1 = condition1,condition2,...
  link_param_2 = condition1,condition2,...

which test, respectively, both ends, the first end, and the second end. The first end of the link corresponds to the first line of the link line pair. For example, given the link
 
  ...
  link018136 cf12 9800000 9900000
  link018136 hs6 37914056 37916509
  ...

the first end is cf12:9800000-9900000 and the second end is hs6:37914056-37916509.

=over

=item * chr

Applies the condition to the chromosome of the link.

  chr   = 1
  chr_2 = x

=item * start, end, size, rev, inv

Applies the condition to the start, end or size of the link. The link size is end-start+1.

  start = [?<]10000000
  end   = [?>]50000000

To test whether a link end is reversed , use 'rev'. 

  # first end is reversed (start > end)
  rev_1 = 1
  # second end is inverted (start > end)
  rev_2 = 1
  # both ends are reversed
  rev   = 1

If only one of the link's ends is reversed, then the link will be considered inverted. To test this,

  inv   = 1

Note that 'rev' is a property of an end of the link, whereas 'inv' is a property of the link.

=item * span

Applies the condition to the span of the link and should be used with the "s" condition TYPE.

  span = [?i]1000-2000

=item * id

Applies the condition to the id of the link.

=item * color, thickness, z, etc.

Any condition that is not one of id, chr, start, end, size, span is
assumed to be a link option and is applied to the option of the
link. For example, options include color, thickness, and z.

  color = [?e]chr12
  z = [?>]10

=back

=head2 Conditions

A condition has the following format

  { [?TYPE {ID} {!} ] } CONDITION

where elements in { } are optional. Briefly, TYPE is used to indicate
how the CONDITION text should be applied (e.g. regular expression,
integer range, exact match, etc). The ID is used to combine rules so
that their match status is AND'ed together to determine whether the
link passes. The trailing "!" is used to negate the rule (i.e. for the
link to pass, the rule must fail).

=over

=item * Default Condition is a Regular Expression

If no optional elements in the condition are specified, it is treated as a regular expression. For example,

  LINK_PARAM = 12

would apply the regular expression "12" to the link parameter. You can provide a list of conditions with ;; as a delimiter (you can adjust the delimiter in the configuration file).

  LINK_PARAM = 12;;x;;y

which are interpreted as a series of regular expressions used to test
the link parameter. The link will be passed if ANY condition matches

(i.e. match results are OR'ed). If you want match results to be AND'ed
(i.e. multiple rules must match for the link to pass, read below).

The regular expression is case-insensitive.

=item * Adjusting Condition Type

The following conditions types are possible

=over

=item * r - regular expression (default)

  LINK_PARAM = 12
  LINK_PARAM = 12;;x;;y

You can specify the type as a regular expression explicitly with [?r] but this is not necessary because that is the default.

  LINK_PARAM = [?r]12;;[?r]x;;[?r]y

=item * s - integer span

The syntax of the integer range is any string supported by Set::IntSpan.

  LINK_PARAM = [?s]1000-2000
  LINK_PARAM = [?s]1000-2000,3000-4000
  LINK_PARAM = [?s]1000-2000,3000-)
  LINK_PARAM = [?s](-1000,2000,3000-)
  LINK_PARAM = [?s](-1000,2000,3000-4000,5000-)

=item * e - exact match

The exact match is useful for matching chromosome names in cases where regular expressions might match other chromosomes (and you don't want to include anchors in your regular expression).

  LINK_PARAM = [?e]chr1
  LINK_PARAM = [?e]chr1;;[?e]chr2

Note that the condition type must be prefixed to each individual condition, if a list of conditions is supplied.

The exact match is not case-sensitive.

=item * < - less than 

If the value is a number, numerical < is used, otherwise the values are compared based on asciibetic order (e.g. le).

  # LINK_PARAM must be less than 100
  LINK_PARAM = [?<]100

  # LINK_PARAM must be less (in the asciibetic sense) than chr20 (e.g. chr1, chr11, chr111, chr19, etc)
  LINK_PARAM = [?<]chr20

=item * > - greater than

Works just like the less than condition [?<].

=item * mixing condition types

You can have multiple condition types for a parameter. Remember that results of each condition will be OR'ed together.

  LINK_PARAM = 1,[?e]chr5,[?e]chr22

The first condition is a regular expression (by default). The second
and third conditions are exact text matches for chr5 and chr22. Thus,
the LINK_PARAM will pass if (a) it contains a "1", or (b) it is "chr5"
or (c) it is "chr22".

=back

=item * Negating a Condition 

In order to negate a condition, use "!". When "!" is used, the condition must fail for the result to be acceptable.

  # must not match regular expression "1"
  LINK_PARAM = [?r!]1

  # must not be "chr12"
  LINK_PARAM = [?e!]chr12

  # must not be within the range 1000-2000
  LINK_PARAM = [?i!]1000-2000

In order to combine negated conditions with positive ones, you will need to group conditions so that their results are AND'ed.

=item * Grouping Conditions

So far, all condition results were evaluated with OR. In other words,
if you had a list of conditions, the successful pass of any of the
conditions resulted in the link being passed. This is useful if you want to accept multiple values 

  # chr12 or chr14 
  LINK_PARAM = [?e]chr12;;[?e]chr14

However, what if you wanted to match regular expression "1" but not chr14. Here's where the ID field comes in. By tagging multiple conditions with the same ID field the results of each of these conditions is AND'ed together to determine whether the link passes.

  # ID=0 
  # match regular expression "1" AND not be "chr14"
  LINK_PARAM = [?r0]1;;[?e0!]chr14

=back

=head1 EXAMPLES

Below are some examples to get you started. Note the interplay between conditions with IDs and condition without IDs. The former collate conditions into AND'ed sets, which are then in turn OR'ed with other sets and with conditions without IDs.

=head2 Filtering by Chromosomes

To select links in which both ends match regular expression "1"

  chr = 1

So simple. Now, to select links in with either ends matches regular expression "1", 

  chr_1 = 1
  chr_2 = 1

The difference between these two cases is that in the first instance, since the link_parameter does not include a _1 or _2 suffix, the condition is applied to both ends of the link and both ends must pass. In the second case, each end is tested independently and the results are OR'ed together.

If you want links where the first chromosome matches x or the second matches y,

  chr_1 = x
  chr_2 = y

The test is (chr_1 match "x") OR (chr_2 match "y"). Note, however, that this set of rules requires that the first chromosome match "x" OR the second chromosome match "y". It will fail if the first chromosome matches "y" and the second matches "x". To match both possibilities, you might try

  chr_1 = x;;y
  chr_2 = y;;x

In this case the test is (chr_1 match "x") OR (chr_1 match "y") OR (chr_2 match "x") OR (chr_2 match "y"). 

If you are looking for links between x and y chromosomes, then you require the results of each condition to be AND'ed. For this, use IDs

  chr_1 = [?r1]x
  chr_2 = [?r1]y

Both of these rules have ID=1 and are therefore grouped into a set. Match results within a set are AND'ed. Thus, the test is (chr_1 match "x") AND (chr_2 match "y"). If you want to match the other order too,

  chr_1 = [?r1]x;;[?r2]y
  chr_2 = [?r1]y;;[?r2]x

In this example, there are two IDs. The rules with ID=0 match chr1 to "x" and chr2 to "y" and the rules with ID=1 match the converse (chr1 to "y" and chr2 to "x"). 

Now let's suppose we want links that are either cf1-hs6, cf14-hs7 or cfx-hsx. Here cf is a dog chromosome and hs is a human chromosome. The rule for this is

  chr_1 = [?e1]cf1;;[?e2]cf14;;[?e3]cfx
  chr_2 = [?e1]hs6;;[?e2]hs7;;[?e3]hsx

You can add additional conditions without IDs to accept more links. For example, if you also wanted to add any links for which chr_1 was cf9 or for which chr_2 matched "3"

  chr_1 = [?e1]cf1;;[?e2]cf14;;[?e3]cfx;;[?e]cf9
  chr_2 = [?e1]hs6;;[?e2]hs7;;[?e3]hsx;;3

Remember that [?r]3 is the same as 3, since the default condition type is a regular expression.

You can take advantage of the "!" flag to negate rules to avoid chromosomes. For example, if you want links between cfx and any chromosome other than hsx

  chr_1 = [?e1]cfx
  chr_2 = [?e1!]hsx

and here the test is (chr_1 is cfx) AND (chr_2 is not hsx). 

You can combine chr with chr_1/chr_2 rules

  chr   = 2
  chr_1 = [?e1]cfx
  chr_2 = [?e1!]hsx

to produce the test ( (chr_1 is cfx) AND (chr_2 is not hsx) ) OR ( chr_1 matches "2" AND chr_2 matches "2" ). Use "chr" as the parameter if you want to apply the same condition to both ends of th elink and chr_1 and chr_2 to apply different conditions.

=head2 Filtering by Position

To test link position, use the parameters "start", "end" and "span". Both "start" and "end" are ideal for testing with condition type < and >. To select links for which both ends start before 10,000,000

  start = [?<]1e7
  # or
  start = [?<]10000000

to add another OR'ed condition to pass links with start values beyond 100,000,000

  start = [?<]1e7;;[?>]1e8

A more complex test for the "start" and "end" values can be leveled by using the "s" condition type, which tests for membership within a span. This rule

  start = [?i]1e6-2e6,3e6-4e6

will pass links for which both ends are within 1-2Mb or 3-4Mb. Note that the "," in this condition is part of the span and does not create a new condition. To have two conditions, use the ;; delimiter.

  start = [?i]1e6-2e6,3e6-4e6;;[?s]1e7-1.1e7,3e6-4e6

When using the "span" parameter, you should always use the "s" condition type. This will check whether the link span intersects the provided span.

  span = [?s]2e7-5e7

This will select all links whose spans (at both ends) intersect the coordinates 20-50Mb. To be more selective, use the _1 and _2 suffixes.

  span_1 = [?s1]2e7-5e7
  span_2 = [?s1]2e7-2.5e7

will select links joining 20-50Mb regions to 20-25Mb regions. An ID was required here to make the results AND'ed. To avoid certain regions, use the "!" flag

  span = [?s!](-1e7

will avoid all links within the first 10Mb.

=head2 Filtering by Orientation

Link ends which have start>end will have the 'rev' (reversed) parameter set. Thus, to extract links which have the first end reversed, use

  rev_1 = 1

To test links that have both ends inverted, 

  rev_1 = 1
  rev_2 = 1

or  

  rev = 1 

An inverted link has one of its ends reversed. To extract an inverted link, use 'inv'

  inv = 1

Because 'inv' is a property of the link, you do not need to use inv_1 or inv_2.

=head2 Filtering by Link Options

Any link option such as "color", "thickness", or "z" can be tested in similar rules. 

  # links with z value greater than 10
  z = [?>]10 

  # links with z value between 5 and 15
  z = [?s]5-15

=head2 Mixing Conditions and IDs

You can write fairly complex rules by combining different link parameter, rule types and IDs.

For example to apply the following filter

  (
  between (hs1 and cf6) 
  AND
  within 75-80 Mb on hs1
  AND 
  larger than 5kb on hs1
  )

  OR

  (
  larger than 500kb on hs1
  )

use the following rules

  chr_1   = [?e1]cf6
  chr_2   = [?e1]hs1
  span_2  = [?s1]75e6-80e6
  size_2  = [?>1]5e3;;[?>]500e3

=head1 HISTORY

=over

=item * 31 Jan 2012

Added inverted links (start > end)

=item * 23 July 2008

Reworked rules and conditions to include TYPE and ID.

=item * 9 July 2008

Started and versioned.

=back 

=head1 BUGS

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

  Martin Krzywinski
  Genome Sciences Centre
  Vancouver BC Canada
  www.bcgsc.ca
  martink@bcgsc.ca

=cut

################################################################
#
# Copyright 2002-2008 Martin Krzywinski
#
# This file is part of the Genome Sciences Centre Perl code base.
#
# This script is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this script; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
################################################################

################################################################
#                           Martin Krzywinski (martink@bcgsc.ca)
#                                                           2008
################################################################

use strict;
use Config::General;
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
use IO::File;
use Math::VecStat qw(sum min max average);
use Memoize;
use Pod::Usage;
use Set::IntSpan;
use Time::HiRes qw(gettimeofday tv_interval);
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";
use vars qw(%OPT %CONF);

################################################################
#
# *** YOUR MODULE IMPORTS HERE
#
################################################################

#memoize("get_num_links");

GetOptions(\%OPT,
	   "links=s",
	   "chr_rx=s",
	   "chr_rx_1=s",
	   "chr_rx_2=s",
	   "size_range=s",
	   "size_range_1=s",
	   "size_range_2=s",
	   "nointra",
	   "nointer",
	   "configfile=s","help","man","debug+");

pod2usage() if $OPT{help};
pod2usage(-verbose=>2) if $OPT{man};
loadconfiguration($OPT{configfile});
populateconfiguration(); # copy command line options to config hash
validateconfiguration(); 
if($CONF{debug} > 1) {
  $Data::Dumper::Pad = "debug parameters";
  $Data::Dumper::Indent = 1;
  $Data::Dumper::Quotekeys = 0;
  $Data::Dumper::Terse = 1;
  print Dumper(\%CONF);
}

my $links = parse_links($CONF{links});

for my $link (@$links) {

  my ($pass,$fail,$idresults);

  $fail ||= 1 if $CONF{nointra} && $link->[0]{chr} eq $link->[1]{chr};
  $fail ||= 1 if $CONF{nointer} && $link->[0]{chr} ne $link->[1]{chr};

  next if $fail;

  printdebug("link test",@{$link->[2]});
  printdebug("link test",@{$link->[3]});

  for my $link_param (sort keys %{$CONF{conditions}}) {
    my $condition = $CONF{conditions}{$link_param};
    my ($link_param,$num) = $link_param =~ /(.*?)_?(\d+)?$/;
    for my $condition (split(/$CONF{delim}/,$condition)) {
      my ($condition_pass,$condition_fail);
      my ($type,$id,$neg_flag,$condition) = $condition =~ /^(?:\[[?]([a-z<>])?(\d+)?(!)?.*\])?(.+)/i;
      $type ||= "r";

      # values to test against
      my @values = fetch_parameter_values($link_param,$num,$link);

      printdebug(sprintf("param %s type %s id %s flag %s condition %s values %s %s",
			$link_param,$type,$id,$neg_flag,$condition,@values));

      #print Dumper($link);

      my $vpass = 1;
      for my $value (@values) {
	if($type eq "r") {
	  $vpass &&= $value =~ /$condition/i;
	} 
	elsif ($type eq "e") {
	  $vpass &&= lc $value eq lc $condition;
	}
	elsif ($type eq "s") {
	  $condition =~ s/(\d*(\.\d*)?)e(\d)/$1*10**$3/ge;
	  if($link_param eq "span") {
	    $vpass &&= Set::IntSpan->new($condition)->intersect($value)->cardinality > 0;
	  } else {
	    $vpass &&= Set::IntSpan->new($condition)->member($value);
	  }
	} 
	elsif ($type eq "<") {
	  #printinfo($value,is_number($value),is_number($condition));
	  if(is_number($value) && is_number($condition)) {
	    $vpass &&= $value < $condition;
	  } else {

	    $vpass &&= $value lt $condition;
	  }
	} 
	elsif ($type eq ">") {
	  #printinfo($value,is_number($value),is_number($condition));
	  if(is_number($value) && is_number($condition)) {
	    $vpass &&= $value > $condition;
	  } else {
	    $vpass &&= $value gt $condition;
	  }
	}
	else {
	  die "condition of type $type cannot be parsed - this type is not defined";
	}
	printdebug(sprintf("value %s pass %d",$value,$vpass));
      }
      $vpass = ! $vpass if $neg_flag;
      if(defined $id) {
	if(defined $idresults->{$id}) {
	  $idresults->{$id} &&= $vpass;
	} else {
	  $idresults->{$id} = $vpass;
	}
      } else {
	$pass ||= $vpass;
      }	
    }
  }

  my $linkpass = 0;
  if( (defined $pass && $pass)
      ||
      (defined $idresults && grep($_, values %$idresults)) ) {
    $linkpass = 1;
  }
  printdebug("link result",$link->[0]{id},$linkpass,@{$link->[2]},@{$link->[3]});
  if($linkpass) {
    printinfo(@{$link->[2]});
    printinfo(@{$link->[3]});
    #printdumper($link);
  }
}

sub is_number {
  my $x = shift;
  return scalar $x =~ /^-?\d*\.?\d*([e]\d+)?$/;
}

sub fetch_parameter_values {
  my ($link_param,$num,$link) = @_;
  my @values;
  for my $i (1,2) {
      next if defined $num && $i != $num;
      if(exists $link->[$i-1]{$link_param}) {
	  push @values, $link->[$i-1]{$link_param};
      }
      elsif (exists $link->[$i-1]{opt}{$link_param}) {
	  push @values, $link->[$i-1]{opt}{$link_param};
      }
  }
  return @values;
}

sub read_karyotype {
  my $file = shift;
  return undef if ! -e $file;
  open(F,$file);
  my $k;
  while(<F>) {
    chomp;
    if(/^chr/) {
      my @tok = split;
      my $chr = $tok[2];
      push @{$k->{chr}}, {chr=>$chr, tok=>\@tok};
    } else {
      push @{$k->{band}}, $_;
    }
  }
  return $k;
}

sub parse_links {
  my $file = shift;
  open(F,$file);
  my $links;
  while(<F>) {
    chomp;
    my @tok1 = split;
    my $line2 = <F>;
    last unless $line2;
    chomp $line2;
    my @tok2 = split(" ",$line2);
    my $link = [ 
	link_end_hash(\@tok1),
	link_end_hash(\@tok2),
	\@tok1,
	\@tok2,
	];
    if( ($link->[0]{rev} && ! $link->[1]{rev})
	||
	($link->[1]{rev} && ! $link->[0]{rev}) ) {
	$link->[0]{inv} = 1;
	$link->[1]{inv} = 1;
    } else {
	$link->[0]{inv} = 0;
	$link->[1]{inv} = 0;
    }
    push @$links, $link;
  }
  return $links;
}

# links with start>end will have the coordinates
# reversed and 'inv' flag set

sub link_end_hash {
  my $tok = shift;
  my $opt = {};
  for my $pair (split(",",$tok->[4])) {
    my ($var,$value) = split("=",$pair);
    $opt->{$var}=$value;
  }
  my ($start,$end) = ($tok->[2],$tok->[3]);
  my $rev = 0;
  if($start > $end) {
      $rev = 1;
      ($start,$end) = ($end,$start);
  }
  my $link = {id=>$tok->[0],
	      chr=>$tok->[1],
	      start=>$start,
	      end=>$end,
	      rev=>$rev,
	      size=>$end-$start+1,
	      span=>Set::IntSpan->new( $start < $end ? "$start-$end" : $start),
	      opt=>$opt};
  return $link;
}

sub validateconfiguration {
    $CONF{delim} ||= ";;";
}

################################################################
#
# *** DO NOT EDIT BELOW THIS LINE ***
#
################################################################

sub populateconfiguration {
  foreach my $key (keys %OPT) {
    $CONF{$key} = $OPT{$key};
  }

  # any configuration fields of the form __XXX__ are parsed and replaced with eval(XXX). The configuration
  # can therefore depend on itself.
  #
  # flag = 10
  # note = __2*$CONF{flag}__ # would become 2*10 = 20

  for my $key (keys %CONF) {
    my $value = $CONF{$key};
    while($value =~ /__([^_].+?)__/g) {
      my $source = "__" . $1 . "__";
      my $target = eval $1;
      $value =~ s/\Q$source\E/$target/g;
      #printinfo($source,$target,$value);
    }
    $CONF{$key} = $value;
  }

}

sub loadconfiguration {
  my $file = shift;
  my ($scriptname) = fileparse($0);
  if(-e $file && -r _) {
    # great the file exists
  } elsif (-e "/home/$ENV{LOGNAME}/.$scriptname.conf" && -r _) {
    $file = "/home/$ENV{LOGNAME}/.$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/etc/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/etc/$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/../etc/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/../etc/$scriptname.conf";
  } else {
    return undef;
  }
  $OPT{configfile} = $file;
  my $conf = new Config::General(-ConfigFile=>$file,
				 -AllowMultiOptions=>"yes",
				 -LowerCaseNames=>1,
				 -AutoTrue=>1);
  %CONF = $conf->getall;
}

sub printdebug {
  printinfo("debug",@_)  if $CONF{debug};
}

sub printdumper {
    printinfo(Dumper(@_));
}

sub printinfo {
  printf("%s\n",join(" ",@_));
}

