#!/usr/bin/perl
# Parser for Parts Registry DAS & FASTA output
# Written on 04/02/2010, by Zabeen Patel

use LWP::Simple; # For url access

# Important
# first visit http://partsregistry.org/fasta/parts/All_Parts
# and save webpage to the same folder as this script

# Part 2: only extracts Seq & Feature info to one large file.

open(SEQFEATOUT, ">seqfeatall.csv") || die $!; # seq_feat output file
open(ALLPARTIN, "All_Parts") || die $!; # open FASTA file

$URL_feat = "http://partsregistry.org/das/parts/features/?segment="; # DAS
$part_count = 0;

while(<ALLPARTIN>){

    if ($part_count < 500){ # enter the number of parts you wish to process
    
    if (/>/){ # entry start
        $part_count +=1;
        
        />(.+)\s\w\s\d/; # find Part_ID
        getstore($URL_feat.$1,"temp_feat.txt"); # stores the XML page in a temp file
        
        #
        # Get Seq_feat
        #
        
        open(TEMP, "temp_feat.txt");
        
        $feat_count = 0;
        while(<TEMP>){
            if (/<FEATURE/){
                $feat_count +=1;
            
                # Print out parsed info
                if ($feat_count >= 2) { # only parse files with >1 Feature
                    print SEQFEATOUT "$part_count,"; # PI_ID
                    /FEATURE\sid='(\d+)'\slabel='(.+)'/;
                    print SEQFEATOUT "$1,$2,"; # Feat_ID, Label
                    while (<TEMP>){
                        last if (/\/FEATURE/);
                        if (/TYPE\sid='(.+)'\scategory/){
                            print SEQFEATOUT "$1,"; # Feat_type
                            }
                        if (/START>(.+)<\//){
                            print SEQFEATOUT "$1,"; # Start
                            }
                        if (/END>(.+)<\//){
                            print SEQFEATOUT "$1,"; # End
                            }
                         if (/ORIENTATION>(.?)<\//){
                            print SEQFEATOUT "$1,"; # Orientation
                            }
                        if (/PHASE>(.)<\//){
                            print SEQFEATOUT "$1\n"; # Phase
                            }
                        }
                    }
            }
        } # close WHILE <TEMP>
    print "part count = $part_count\n"; # parser progress check 
    print "feat_count = $feat_count\n"; # parser progress check         
    close TEMP;

        
    }
    }
}

close ALLPARTIN;
close SEQFEATOUT;


# delete temp files
unlink("temp_feat.txt");

# The End

