#!/usr/bin/perl
# Parser for Parts Registry DAS & FASTA output
# Written on 04/02/2010, by Zabeen Patel

use LWP::Simple; # For url access

# Important:
# First visit http://partsregistry.org/fasta/parts/All_Parts
# and save the page as a file named "All_Parts" in the same folder as this script.

# Part 1: Only extracts Part info.


use strict;
use warnings;

# ---------------------------------------------------------------------------
# Part 1 driver.
#
# Reads the FASTA dump "All_Parts", and for each entry writes one row to
# "partsinfo.csv" laid out as:
#
#     <header fields, whitespace turned into commas>,<description>,<sequence>,<size>
#
# <size> is the DNA length reported by the Parts Registry DAS service for the
# part; it is left empty when the lookup fails, so every row keeps the same
# number of trailing columns.
# ---------------------------------------------------------------------------

my $fasta_file = 'All_Parts';        # FASTA input, saved by hand beforehand
my $csv_file   = 'partsinfo.csv';    # parts_info output file
my $temp_file  = 'temp.txt';         # scratch file for the DAS dna response
my $url_dna    = 'http://partsregistry.org/das/parts/dna/?segment=';
my $max_parts  = 500;                # enter the number of parts you wish to process

# Split one FASTA header line into its pieces.
# Takes the raw header line (starting with '>').
# Returns ($part_id, $csv_fields, $description); $part_id is undef when the
# line is not a usable header.
sub _parse_header {
    my ($header) = @_;

    $header =~ s/\r?\n\z//;    # drop the line ending (LF or CRLF)

    # The part ID is the first whitespace-delimited token after '>'.
    return (undef, '', '') if $header !~ /^>(\S+)/;
    my $part_id = $1;

    # Remove only the FASTA marker, not '>' characters inside the description.
    $header =~ s/^>//;

    # Cut the quoted description out of the header. Only trust $1 when the
    # substitution actually matched, otherwise a stale capture would leak in.
    my $description = '';
    if ($header =~ s/"(.*)"//) {
        $description = $1;
        # Commas would split the CSV column, so turn them into spaces.
        # (tr/// is used because "\s" on the replacement side of s/// is not
        # whitespace - it is a literal "s".)
        $description =~ tr/,/ /;
    }

    $header =~ s/\s/,/g;    # introduce comma delimiter

    return ($part_id, $header, $description);
}

# Download the DAS dna document for one part and pull out its length.
# Takes the full request URL and the path of the scratch file to store it in.
# Returns the length as a string of digits, or '' when the download or the
# parse fails (a warning is printed in that case).
sub _fetch_dna_length {
    my ($url, $scratch) = @_;

    # getstore returns the HTTP status code; never read the scratch file
    # after a failed request, it may still hold the previous part's data.
    my $status = getstore($url, $scratch);
    if (!is_success($status)) {
        warn "Could not fetch $url (HTTP status $status)\n";
        return '';
    }

    my $das_in;
    if (!open($das_in, '<', $scratch)) {
        warn "Cannot read $scratch: $!\n";
        return '';
    }

    my $length = '';
    while (my $das_line = <$das_in>) {
        if ($das_line =~ /<DNA\slength='(\d+)'>/) {
            $length = $1;
            last;    # one size column per row
        }
    }
    close $das_in;

    return $length;
}

open(my $part_out,    '>', $csv_file)   or die "Cannot write $csv_file: $!";
open(my $all_part_in, '<', $fasta_file) or die "Cannot read $fasta_file: $!";

my $part_count = 0;

# $line always holds the next unprocessed line. The sequence reader below may
# stop on the following entry's header, which must then be handled by the
# outer loop instead of being thrown away.
my $line = <$all_part_in>;

PART:
while (defined $line && $part_count < $max_parts) {

    # Anything that is not an entry start (blank separators, stray text)
    # is skipped.
    if ($line !~ /^>/) {
        $line = <$all_part_in>;
        next PART;
    }

    my ($part_id, $csv_fields, $desc) = _parse_header($line);
    if (!defined $part_id) {
        warn "Skipping malformed header: $line";
        $line = <$all_part_in>;
        next PART;
    }
    $part_count += 1;

    # Collect the sequence: every line up to a blank line, the next header,
    # or end of file.
    my $sequence = '';
    while (defined($line = <$all_part_in>)) {
        last if $line =~ /^\s*$/ || $line =~ /^>/;
        $line =~ s/\r?\n\z//;
        $sequence .= $line;
    }

    my $size = _fetch_dna_length($url_dna . $part_id, $temp_file);

    # export parsed data to output file
    print {$part_out} $csv_fields, "$desc,", $sequence, ",$size\n"
        or die "Cannot write to $csv_file: $!";

    print "part count = $part_count\n";    # parser progress check
}

close $all_part_in;
close $part_out or die "Cannot finish writing $csv_file: $!";

# delete temp files
unlink($temp_file);

# The End

