#!/usr/bin/perl
# Parser for Parts Registry DAS & FASTA information
# Written on 04/02/2010, by Zabeen Patel
#
# Runs in two passes:
#   1. Reads the locally saved FASTA dump "All_Parts", fetches the DAS dna
#      document for each part, and writes partsinfo.csv.
#   2. Re-reads the list of part IDs collected in pass 1, fetches the DAS
#      features document for each, and writes seqfeat.csv and featinfo.csv.
#
# Important
# first visit http://partsregistry.org/fasta/parts/All_Parts
# and save webpage to the same folder as this script

use strict;
use warnings;

use LWP::Simple;    # For url access
use XML::Parser;    # For parsing XML

# Scratch files created in the current directory and removed at the end.
my $PART_ID_FILE = 'partid.txt';    # temp list of Part IDs (pass 1 -> pass 2)
my $DNA_TEMP     = 'temp1.txt';     # last downloaded DAS dna document
my $FEATURE_TEMP = 'temp2.txt';     # last downloaded DAS features document

my $MAX_PARTS = 500;    # enter the number of parts you wish to process

#
# FASTA file parser
#
# Extracts data for:
# Table: parts_info
# Columns: Part_ID,Status,Part_no,Part_type,Description,Sequence,Size
{
    open my $part_out, '>', 'partsinfo.csv'
        or die "Cannot write partsinfo.csv: $!";    # parts_info output file
    open my $id_out, '>', $PART_ID_FILE
        or die "Cannot write $PART_ID_FILE: $!";
    open my $fasta_in, '<', 'All_Parts'
        or die "Cannot read All_Parts: $!";         # FASTA file

    my $dna_url = "http://partsregistry.org/das/parts/dna/?segment=";
    my $count   = 0;

    LINE:
    while (my $line = <$fasta_in>) {

        # Nothing is emitted once the limit is reached, so stop reading.
        last LINE if $count >= $MAX_PARTS;

        if ($line =~ />/) {

            # The part ID is everything between '>' and the
            # "<one word char> <digit>" pair that follows it.
            my ($part_id) = $line =~ />(.+)\s\w\s\d/;
            if (!defined $part_id) {
                # Without this guard a stale capture from an earlier match
                # would be written out as the ID.
                warn "Skipping unrecognised FASTA header: $line";
                next LINE;
            }

            $count += 1;
            print {$id_out} "$part_id\n";
            my $dna_rc = getstore($dna_url . $part_id, $DNA_TEMP);

            # Turn the header into CSV fields: drop '>', lift the quoted
            # description out, then turn every remaining whitespace
            # character into a comma.
            (my $fields = $line) =~ s/>//g;
            chomp $fields;
            my $desc = $fields =~ s/"(.+)"// ? $1 : '';

            # Commas inside the description would split the CSV column.
            # (The old s/,/\s/ inserted the letter "s", because \s is not a
            # whitespace escape in a replacement string.)
            $desc =~ tr/,/ /;
            $fields =~ s/\s/,/g;

            print {$part_out} $fields;
            print {$part_out} "$desc,";

            # Sequence lines run until the next empty line; they are joined
            # into a single field.
            SEQ:
            while (my $seq = <$fasta_in>) {
                last SEQ if $seq =~ /^\n/;    # stop when next entry reached
                $seq =~ s/\n//g;
                print {$part_out} $seq;
            }

            # Size comes from the DAS dna document.  Only read the temp file
            # when this download succeeded, otherwise it may still hold the
            # previous part's data.
            if (is_success($dna_rc)) {
                open my $dna_in, '<', $DNA_TEMP
                    or die "Cannot read $DNA_TEMP: $!";
                while (my $dna_line = <$dna_in>) {

                    # NOTE(review): the original pattern was lost (source
                    # showed an empty //); reconstructed as the length
                    # attribute of a <DNA> element - confirm against a real
                    # response.
                    if ($dna_line =~ /<DNA\s+length=["'](\d+)["']/) {
                        print {$part_out} ",$1";    # print size
                    }
                }
                close $dna_in;
            }
            else {
                warn "Download of DNA info for $part_id failed (status $dna_rc)\n";
            }

            print {$part_out} "\n";
        }

        print "count = $count\n";
    }

    close $fasta_in;
    close $part_out or die "Cannot close partsinfo.csv: $!";
    close $id_out   or die "Cannot close $PART_ID_FILE: $!";
}

#
# DAS XML file parser
#
# Extracts data for:
# Table: seq_feat
# Columns: Part_ID,Feat_ID,Start,End
# Table: feat_info
# Columns: Feat_ID,Label,Feat_type,Orientation,Phase
{
    open my $seqfeat_out, '>', 'seqfeat.csv'
        or die "Cannot write seqfeat.csv: $!";     # seq_feat output file
    open my $featinfo_out, '>', 'featinfo.csv'
        or die "Cannot write featinfo.csv: $!";    # feat_info output file
    open my $id_in, '<', $PART_ID_FILE
        or die "Cannot read $PART_ID_FILE: $!";    # list of Part IDs

    my $feature_url = 'http://partsregistry.org/das/parts/features/?segment=';

    PART:
    while (my $id = <$id_in>) {
        $id =~ s/\n//g;
        next PART if $id eq '';

        # stores the webpage in a temp file
        my $feat_rc = getstore($feature_url . $id, $FEATURE_TEMP);
        if (!is_success($feat_rc)) {
            # Skip rather than re-parse the previous part's temp file.
            warn "Download of features for $id failed (status $feat_rc)\n";
            next PART;
        }

        open my $feat_in, '<', $FEATURE_TEMP
            or die "Cannot read $FEATURE_TEMP: $!";

        my $counter = 0;
        while (my $line = <$feat_in>) {

            # NOTE(review): this test and the counter increment were lost in
            # the source (only "if (/= 2) {" survived); reconstructed from
            # the brace structure and the ">1 Feature" comment - confirm.
            next unless $line =~ /<FEATURE/;
            $counter += 1;

            # The first FEATURE element of each document is not exported.
            next unless $counter >= 2;

            # Fall back to empty fields instead of leaking stale captures
            # when the opening tag has an unexpected shape.
            my ($feat_id, $label) =
                $line =~ /FEATURE\sid='(\d+)'\slabel='(.+)'/
                ? ($1, $2)
                : ('', '');

            print {$seqfeat_out} "$id,";                   # Part_ID
            print {$seqfeat_out} "$feat_id,";              # Feat_ID
            print {$featinfo_out} "$feat_id,$label,";      # Feat_ID, Label

            # Consume the element's child lines up to its closing tag.
            CHILD:
            while (my $child = <$feat_in>) {
                last CHILD if $child =~ /\/FEATURE/;

                if ($child =~ /TYPE\sid='(.+)'\scategory/) {
                    print {$featinfo_out} "$1,";           # Feat_type
                }
                if ($child =~ /START>(.+)<\//) {
                    print {$seqfeat_out} "$1,";            # Start
                }
                if ($child =~ /END>(.+)<\//) {
                    print {$seqfeat_out} "$1\n";           # End
                }
                if ($child =~ /ORIENTATION>(.?)<\//) {
                    print {$featinfo_out} "$1,";           # Orientation
                }
                if ($child =~ /PHASE>(.)<\//) {
                    print {$featinfo_out} "$1\n";          # Phase
                }
            }
        }

        print "counter = $counter\n";
        close $feat_in;
    }

    close $id_in;
    close $seqfeat_out  or die "Cannot close seqfeat.csv: $!";
    close $featinfo_out or die "Cannot close featinfo.csv: $!";
}

# delete temp files
unlink($DNA_TEMP);
unlink($FEATURE_TEMP);
unlink($PART_ID_FILE);

# The End