#!/usr/bin/perl
# Parser for Parts Registry DAS & FASTA information
# Written on 04/02/2010, by Zabeen Patel

use LWP::Simple; # For url access
use XML::Parser; # For parsing XML

# Important:
# First visit http://partsregistry.org/fasta/parts/All_Parts
# and save the page under the file name "All_Parts" in the folder this
# script is run from (the script opens "All_Parts" by relative path).

#
# FASTA file parser
#
# Extracts data for:
# Table: parts_info
# Columns: Part_ID,Status,Part_no,Part_type,Description,Sequence,Size

# Reads the saved FASTA dump "All_Parts" and, for each header line that
# parses, writes:
#   - one row to partsinfo.csv:
#       Part_ID,Status,Part_no,Part_type,Description,Sequence,Size
#   - one line to partid.txt holding the Part ID (read back later by the
#     DAS feature section of this script).
# Size is taken from the <DNA length='N'> element of the part's DAS dna
# document, downloaded into temp1.txt.  If the download fails the Size
# column is left empty rather than reusing a stale temp1.txt.

# Number of parts to process; raise this to parse more of the FASTA file.
my $MAX_PARTS = 500;

# DAS dna endpoint; the Part ID is appended as the segment name.
my $URL1 = "http://partsregistry.org/das/parts/dna/?segment=";

open(my $parts_out, '>', "partsinfo.csv")
    or die "Cannot write partsinfo.csv: $!";    # parts_info output file
open(my $id_out, '>', "partid.txt")
    or die "Cannot write partid.txt: $!";       # temp list of Part IDs
open(my $fasta_in, '<', "All_Parts")
    or die "Cannot read All_Parts: $!";         # FASTA file

my $count = 0;
my $line  = <$fasta_in>;

# $line always holds the next unprocessed line.  The sequence reader below
# may stop on the following header, which must then be handled by this loop
# instead of being thrown away.
while (defined($line) && $count < $MAX_PARTS) {

    # A header looks like: >ID <status char> <part number> ...
    # The ID is captured as a run of non-whitespace so that text later in
    # the header (e.g. inside the description) cannot be absorbed into it.
    if ($line !~ />(\S+)\s\w\s\d/) {
        warn "Skipping unparsable header: $line" if $line =~ />/;
        $line = <$fasta_in>;
        next;
    }
    my $part_id = $1;
    $count += 1;
    print $id_out "$part_id\n";

    my $header = $line;
    $header =~ s/>//g;              # remove >
    $header =~ s/[\r\n]+\z//;       # drop the line ending (LF or CRLF)

    # Cut the quoted description out of the header.  It is only used when
    # the match succeeds, so a header without one yields an empty column.
    my $desc = '';
    if ($header =~ s/"(.*)"//) {
        $desc = $1;
        # Commas would split the CSV column.  tr is used because in an
        # s/// replacement "\s" is just the letter s, not a space.
        $desc =~ tr/,/ /;
    }
    $header =~ s/\s+\z//;           # trailing gap left by the description
    $header =~ s/\s/,/g;            # remaining fields become CSV columns

    # Collect the sequence lines up to a blank line, the next header, or EOF.
    my $sequence = '';
    while (defined($line = <$fasta_in>)) {
        last if $line =~ /^\s*$/ || $line =~ /^>/;
        $line =~ s/\s+//g;
        $sequence .= $line;
    }

    # Look up the size in the DAS dna document for this part.
    my $size   = '';
    my $status = getstore($URL1 . $part_id, "temp1.txt");
    if (is_success($status)) {
        if (open(my $das_in, '<', "temp1.txt")) {
            while (my $das_line = <$das_in>) {
                if ($das_line =~ /<DNA\slength='(\d+)'>/) {
                    $size = $1;
                    last;
                }
            }
            close $das_in;
        }
        else {
            warn "Cannot read temp1.txt for $part_id: $!\n";
        }
    }
    else {
        warn "Could not fetch DNA record for $part_id (HTTP status $status)\n";
    }

    print $parts_out join(',', $header, $desc, $sequence, $size), "\n";
    print "count = $count\n";       # progress report
}

close $fasta_in;
close $parts_out or die "Error closing partsinfo.csv: $!";
close $id_out    or die "Error closing partid.txt: $!";

#
# DAS XML file parser
#
# Extracts data for:
# Table: seq_feat
# Columns: Part_ID,Feat_ID,Start,End
# Table: feat_info
# Columns: Feat_ID,Label,Feat_type,Orientation,Phase

# For every Part ID listed in partid.txt, downloads the DAS features
# document into temp2.txt and writes one row per parsed FEATURE element to:
#   seqfeat.csv  : Part_ID,Feat_ID,Start,End
#   featinfo.csv : Feat_ID,Label,Feat_type,Orientation,Phase
# Each row is assembled completely before it is printed, so a feature that
# lacks one of the expected elements gets an empty column instead of a
# short or unterminated row.

open(my $seqfeat_out, '>', "seqfeat.csv")
    or die "Cannot write seqfeat.csv: $!";      # seq_feat output file
open(my $featinfo_out, '>', "featinfo.csv")
    or die "Cannot write featinfo.csv: $!";     # feat_info output file
open(my $id_in, '<', "partid.txt")
    or die "Cannot read partid.txt: $!";        # list of Part IDs

my $URL2 = 'http://partsregistry.org/das/parts/features/?segment='; # DAS

while (my $ID = <$id_in>) {
    chomp $ID;
    next if $ID eq '';

    # Store the DAS document in a temp file.  On failure skip this part:
    # temp2.txt would otherwise still hold the previous part's features.
    my $status = getstore($URL2 . $ID, "temp2.txt");
    if (!is_success($status)) {
        warn "Could not fetch features for $ID (HTTP status $status)\n";
        next;
    }
    my $das_in;
    if (!open($das_in, '<', "temp2.txt")) {
        warn "Cannot read temp2.txt for $ID: $!\n";
        next;
    }

    my $counter = 0;
    while (my $line = <$das_in>) {
        next unless $line =~ /<FEATURE/;
        $counter += 1;

        # The first FEATURE element of each document is deliberately not
        # exported (presumably it describes the part itself - TODO confirm).
        next if $counter < 2;

        if ($line !~ /FEATURE\sid='(\d+)'\slabel='(.*)'/) {
            warn "Skipping unparsable FEATURE for $ID: $line";
            next;
        }
        my ($feat_id, $label) = ($1, $2);
        $label =~ tr/,/ /;          # a comma would split the CSV column

        # Gather the child elements up to the closing FEATURE tag.
        my %field = (
            type        => '',
            start       => '',
            end         => '',
            orientation => '',
            phase       => '',
        );
        while (defined(my $detail = <$das_in>)) {
            last if $detail =~ /\/FEATURE/;
            if ($detail =~ /TYPE\sid='(.+)'\scategory/) {
                $field{type} = $1;          # Feat_type
            }
            if ($detail =~ /START>(.+)<\//) {
                $field{start} = $1;         # Start
            }
            if ($detail =~ /END>(.+)<\//) {
                $field{end} = $1;           # End
            }
            if ($detail =~ /ORIENTATION>(.?)<\//) {
                $field{orientation} = $1;   # Orientation
            }
            if ($detail =~ /PHASE>(.)<\//) {
                $field{phase} = $1;         # Phase
            }
        }

        print $seqfeat_out
            join(',', $ID, $feat_id, $field{start}, $field{end}), "\n";
        print $featinfo_out
            join(',', $feat_id, $label,
                 @field{qw(type orientation phase)}), "\n";
    }

    print "counter = $counter\n";   # progress: FEATURE elements seen
    close $das_in;
}

close $id_in;
close $seqfeat_out  or die "Error closing seqfeat.csv: $!";
close $featinfo_out or die "Error closing featinfo.csv: $!";

# Remove the scratch files: the two DAS downloads and the Part ID list.
# Failures are ignored (a file may never have been created).
unlink("temp1.txt", "temp2.txt", "partid.txt");

# The End

