protein-sol全部改成当前路径

40086dd8 · junyu_yao · 40086dd8 · 40086dd8 · 40086dd8 · 40086dd8
Commit 40086dd8 authored Sep 14, 2023 by junyu_yao
11 changed files
--- a/README_sequence_prediction.txt
+++ b/README_sequence_prediction.txt
+October 2017
+Sequence-based prediction code used in University of Manchester protein-sol server.
+Available from download tab at www.protein-sol.manchester.ac.uk.
+see Hebditch et al (2017) Bioinformatics 33:3098-3100.
+Jim Warwicker and Max Hebditch, Manchester
+*****************
+   Copyright (C) 2017 Jim Warwicker and Max Hebditch
+    These programs are free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License.
+    These programs are distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+The code is available 'as is', we are planning further developments to the server
+and code, and here is just a snapshot that should allow interested users to make
+calculations with multiple fasta sequences (as opposed to the single sequence
+operation of the web server).
+*****************
+CODE is in various perl scripts (.pl)
+fasta_seq_reformat_export.pl
+seq_compositions_perc_pipeline_export.pl
+server_prediction_seq_export.pl
+seq_props_ALL_export.pl
+profiles_gather_export.pl
+RUN is initiated with
+'multiple_prediction_wrapper_export.sh sequence_input_file'
+sequence_input_file has something like a fasta format, it will convert to
+paired records of ID and sequence for each entry:
+>blah11
+MVKVYAPASSANMSVGFDVLFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFDKLPSEPRENIVYQCWERFCQE
+>blah22
+MVKVYAPASSANMSVGFDVLFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFDKLPSEPRENIVYQCWERFCQE
+but regular fasta should be OK (fasta_seq_reformat_export to convert).
+OTHER INPUT (DATA) FILES:
+ss_propensities.txt		(sec struc propensities)
+seq_reference_data.txt		(fitting to experimental solubility data, Niwa et al)
+RUNNING, as set up, will occur with all files in the local directory.
+OUTPUT
+seq_prediction.txt (CSV) contains the data relating to that provided on the server:
+LEGEND records - brief information on features used for the predictions
+HEADERS records - matched with keywords for the data output SEQUENCE records:
+HEADERS PREDICTIONS with SEQUENCE PREDICTIONS
+percent-sol, scaled-sol, population-sol, pI
+HEADERS FEATURES ORIGINAL gives definitions of the features
+HEADERS FEATURES PLOT gives short-hand names for the features
+both of these HEADERS lines are matched columns with SEQUENCE WEIGHTS and SEQUENCE DEVIATIONS
+lines, where:
+SEQUENCE WEIGHTS - weights from the fit to experimental data, only 10 features are non-zero
+SEQUENCE DEVIATIONS - these are the z-score (see publication) deviation for each feature
+then follows PROFILE data across successive 21 amino acid windows for:
+Kyte-Doolittle
+Uversky fold value	- plotted on server
+sequence entropy
+windowed net charge	- plotted on server
--- a/blah.txt
+++ b/blah.txt
--- a/fasta_seq_reformat_export.pl
+++ b/fasta_seq_reformat_export.pl
+#!/usr/bin/perl
+# fasta_seq_reformat_export.pl
+#
+# Purpose: convert the FASTA-like file reformat.in (in this script's directory)
+# into reformat_out, which holds one ID line followed by one sequence line for
+# every accepted entry.
+#
+# Input conventions:
+#   - a line starting with '>' begins a new entry and is kept as its ID line
+#   - a line whose first word is '//' terminates the current entry
+#   - any other non-empty line is appended to the current sequence
+# Sequences are upper-cased and reduced to the 20 standard amino acid letters;
+# every other character (blanks, '*', stray carriage returns, ...) is dropped.
+# Entries with fewer than MIN_RESIDUES residues are not written; a message is
+# appended to blah.txt instead.
+use strict;
+use warnings;
+use FindBin;
+
+use constant MIN_RESIDUES => 21;    # entries shorter than this are rejected
+
+# Clean one stored entry and write it to $out, or log a rejection to $log.
+#   $out          - handle for reformat_out
+#   $log          - handle for blah.txt
+#   $id           - the '>' header line of the entry
+#   $raw_sequence - concatenated sequence lines as read
+# Returns 1 when the entry was written, 0 when it was rejected as too short.
+sub write_record {
+  my ($out, $log, $id, $raw_sequence) = @_;
+  my $residues = uc($raw_sequence);
+  # keep only the 20 standard amino acid letters (c = complement, d = delete)
+  $residues =~ tr/ACDEFGHIKLMNPQRSTVWY//cd;
+  if (length($residues) >= MIN_RESIDUES) {
+    # print, not printf: the ID is data and may legitimately contain '%'
+    print {$out} "$id\n$residues\n";
+    return 1;
+  }
+  print {$log} "**** stopping - fasta_seq_reformat - shorter than 21 aas for protein ID = $id\n";
+  return 0;
+}
+
+my $blah_file = "$FindBin::Bin/blah.txt";
+print $blah_file;    # echoed to STDOUT without a newline, as before
+open(my $blah_fh, '>>', $blah_file) or die "cannot open blah.txt\n";    # file for comments, throughout server code
+open(my $in_fh, '<', "$FindBin::Bin/reformat.in") or die "cannot open reformat_in\n";
+my @input_lines = <$in_fh>;
+close($in_fh);
+open(my $out_fh, '>', "$FindBin::Bin/reformat_out") or die "cannot open reformat_out\n";
+
+my $entry_open = 0;     # true while an entry has been started and not yet written
+my $current_id = '';    # header line of the open entry
+my $sequence   = '';    # sequence text collected so far
+
+foreach my $line (@input_lines) {
+  chomp $line;
+  $line =~ s/\r\z//;    # tolerate CRLF input so IDs do not carry a trailing CR
+  my ($first_word)  = split(" ", $line);
+  my $is_header     = (substr($line, 0, 1) eq '>');
+  my $is_terminator = (defined $first_word and $first_word eq '//');
+  if ($is_header or $is_terminator) {
+    if ($entry_open) {
+      write_record($out_fh, $blah_fh, $current_id, $sequence);
+      # close the entry so that a '//' followed by '>' writes it exactly once
+      $entry_open = 0;
+    }
+    if ($is_header) {
+      $current_id = $line;    # stored now, printed later if the sequence is accepted
+      $sequence   = '';
+      $entry_open = 1;
+    }
+  } elsif (length($line) != 0) {
+    $sequence .= $line;       # add on to the current sequence, if not blank
+  }
+}
+
+# the last entry has no following header to trigger its write
+write_record($out_fh, $blah_fh, $current_id, $sequence) if $entry_open;
+
+close($out_fh) or die "cannot close reformat_out\n";
+close($blah_fh);
+exit;
--- a/multiple_prediction_wrapper_export.sh
+++ b/multiple_prediction_wrapper_export.sh
+#!/bin/bash
+FASTA_in=$1
+SCRIPTDIR="/opt/workspace/D3/src/static/protein-sol"
+cp $FASTA_in $SCRIPTDIR/reformat.in
+perl $SCRIPTDIR/fasta_seq_reformat_export.pl > $SCRIPTDIR/run.log
+mv $SCRIPTDIR/reformat_out $FASTA_in
+cp $FASTA_in $SCRIPTDIR/composition.in
+perl $SCRIPTDIR/seq_compositions_perc_pipeline_export.pl >> $SCRIPTDIR/run.log
+mv $SCRIPTDIR/composition_all.out $SCRIPTDIR/seq_composition.txt
+perl $SCRIPTDIR/server_prediction_seq_export.pl >> $SCRIPTDIR/run.log
+cp $FASTA_in $SCRIPTDIR/seq_props.in
+perl $SCRIPTDIR/seq_props_ALL_export.pl >> $SCRIPTDIR/run.log
+mv $SCRIPTDIR/seq_prediction.txt $SCRIPTDIR/seq_prediction_OLD.txt
+perl $SCRIPTDIR/profiles_gather_export.pl > $SCRIPTDIR/run.log
+#rm bins.txt reformat.in seq_props.in seq_props.out STYprops.out composition.in seq_prediction_OLD.txt
--- a/profiles_gather_export.pl
+++ b/profiles_gather_export.pl
+#!/usr/bin/perl -w
+# profiles_gather_export.pl
+#
+# Purpose: merge per-window profiles from STYprops.out into the prediction
+# file. Reads seq_prediction_OLD.txt and writes ../../tmp/seq_prediction.txt
+# (relative to this script's directory): every input line is copied, and after
+# each SEQUENCE DEVIATIONS line four SEQUENCE PROFILE lines are inserted for
+# that sequence ID when profile data for it exists.
+#
+# STYprops.out records used here (whitespace separated):
+#   STARTID <id>   starts the windows of one sequence
+#   NONP ...       one window; fields 5, 6, 7 and 10 (0-based) are stored
+#   ENDID <id>     ends the sequence; <id> must match the STARTID
+use strict;
+use FindBin;
+
+my $blah_file = "$FindBin::Bin/blah.txt";
+open(my $blah_fh, '>>', $blah_file) or die "cannot open blah.txt\n";    # file for comments, throughout server code
+open(my $props_fh, '<', "$FindBin::Bin/STYprops.out") or die "cannot open STYprops.out profiles_gather\n";
+
+# output label => key of the stored window values, in output order
+my @profile_kinds = (
+  [ 'KyteDoolittle', 'KD'    ],    # taken from NONP field 5
+  [ 'Uversky??',     'FI'    ],    # taken from NONP field 6
+  [ 'entropy',       'ent'   ],    # taken from NONP field 7
+  [ 'charge',        'meanQ' ],    # taken from NONP field 10
+);
+
+my %profile_for;    # sequence ID => { KD, FI, ent, meanQ => [values], nwin => count }
+my $id_here;        # ID given on the most recent STARTID line
+my $current;        # profile record being filled
+
+while (my $line = <$props_fh>) {
+  chomp $line;
+  my @words = split(" ", $line);
+  next unless @words;    # null line
+  if ($words[0] eq 'STARTID') {    # new sequence - store the ID
+    $id_here = $words[1];
+    $current = { KD => [], FI => [], ent => [], meanQ => [], nwin => 0 };
+    $profile_for{$id_here} = $current;    # a repeated ID replaces the earlier record
+  } elsif ($words[0] eq 'NONP') {         # a window line, assumed to belong to id_here
+    next unless $current;                 # window before any STARTID: nothing to attach it to
+    push @{ $current->{KD} },    $words[5];
+    push @{ $current->{FI} },    $words[6];
+    push @{ $current->{ent} },   $words[7];
+    push @{ $current->{meanQ} }, $words[10];
+  } elsif ($words[0] eq 'ENDID') {        # have the profiles
+    my $end_id   = defined $words[1] ? $words[1] : '';
+    my $start_id = defined $id_here  ? $id_here  : '';
+    if ($end_id ne $start_id) {
+      print {$blah_fh} "**** STOPPING - IDs mismatch in profiles_gather.pl\n";
+      exit;
+    }
+    # only windows seen up to ENDID are reported for this sequence
+    $current->{nwin} = scalar @{ $current->{KD} } if $current;
+  }
+}
+close($props_fh);
+
+# read in the seq_prediction_OLD.txt file and write out the updated version with profiles added (interstitial)
+my $old_file = "$FindBin::Bin/seq_prediction_OLD.txt";
+open(my $old_fh, '<', $old_file) or die "cannot open seq_prediction_OLD.txt";
+my @old = <$old_fh>;
+close($old_fh);
+my $new_file = "$FindBin::Bin/../../tmp/seq_prediction.txt";
+open(my $new_fh, '>', $new_file) or die "cannot open seq_prediction.txt";
+
+foreach my $line (@old) {
+  chomp $line;
+  # every line is copied through; print (not printf) because the line is data
+  # and may contain '%'
+  print {$new_fh} "$line\n";
+  my @words = split(",", $line);
+  next unless @words and $words[0] eq 'SEQUENCE DEVIATIONS';
+
+  # write matched seq profiles after the SEQUENCE DEVIATIONS line
+  my $id = $words[1];
+  if (defined $id and exists $profile_for{$id}) {
+    my $record = $profile_for{$id};
+    my $last   = $record->{nwin} - 1;
+    foreach my $kind (@profile_kinds) {
+      my ($label, $key) = @$kind;
+      my @values = @{ $record->{$key} }[0 .. $last];
+      print {$new_fh} join(",", "SEQUENCE PROFILE $label", $id, @values), "\n";
+    }
+  } else {
+    # sequence unmatched - no profile lines can be written, so record it
+    my $shown = defined $id ? $id : '';
+    print {$blah_fh} "**** warning - no profile data for ID = $shown in profiles_gather.pl\n";
+  }
+}
+close($new_fh) or die "cannot close seq_prediction.txt";
+close($blah_fh);
+exit;
--- a/seq_composition.txt
+++ b/seq_composition.txt
--- a/seq_compositions_perc_pipeline_export.pl
+++ b/seq_compositions_perc_pipeline_export.pl
--- a/seq_props_ALL_export.pl
+++ b/seq_props_ALL_export.pl
--- a/seq_reference_data.txt
+++ b/seq_reference_data.txt
+# Niwa population 3049 ids matched, and a further 653 excluded by MEMBRANE_EXCLUDE
+# Niwa - following averages and std deviations refer to the 3049-653 population
+# Niwa - following data segment from seq_compositions_perc_pipeline.pl followed by seq_compositions_stats_pipeline w/ STATS_MODE unset
+# Added POP, TOP, LOW to start of transferred data (seq_compositions), for ease of reading
+# Also added the data source to each horizontal data line, for ease of reading
+POP, NIWA, AVG, 53.347, DEV, 33.910, N, 2396, TAG, percentage-solubility
+POP, NIWA, WHOLE-SEQ,headers,K-R,D-E,naa,totperc,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,K+R,D+E,K+R-D-E,K+R+D+E,F+W+Y,pI,KyteDoo,abs-charge,FoldIndex,disorder,entropy,betapropensity
+POP, NIWA, WHOLE-SEQ,average, -0.922, -0.983,307.996,100.000,  9.347,  1.360,  5.654,  6.637,  3.456,  7.021,  2.543,  5.668,  4.947,  9.969,  2.754,  3.921,  4.409,  4.697,  5.868,  5.542,  5.307,  6.847,  1.297,  2.755, 10.815, 12.291, -1.476, 23.106,  7.508,  6.693,  0.475,  0.029,  0.143, -0.084,  4.067,  0.993
+POP, NIWA, WHOLE-SEQ,std-dev,  3.438,  2.759,178.157,  0.000,  2.752,  1.217,  1.614,  2.132,  1.409,  2.287,  1.289,  1.835,  2.327,  2.568,  1.114,  1.557,  1.632,  1.844,  2.183,  1.876,  1.743,  2.017,  0.994,  1.383,  2.923,  2.586,  3.708,  4.088,  2.529,  1.854,  0.025,  0.027,  0.080,  0.021,  0.093,  0.026
+TOP, NIWA, AVG, 113.241, DEV, 8.620, N, 145, TAG, percentage-solubility
+TOP, NIWA, WHOLE-SEQ,headers,K-R,D-E,naa,totperc,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,K+R,D+E,K+R-D-E,K+R+D+E,F+W+Y,pI,KyteDoo,abs-charge,FoldIndex,disorder,entropy,betapropensity
+TOP, NIWA, WHOLE-SEQ,average,  1.042, -1.529,188.055,100.000,  8.993,  1.277,  6.064,  7.593,  2.964,  6.639,  2.259,  5.472,  6.560,  9.202,  3.041,  3.774,  4.125,  4.691,  5.518,  5.770,  5.464,  7.112,  1.052,  2.431, 12.078, 13.657, -1.579, 25.735,  6.447,  6.572,  0.462,  0.047,  0.089, -0.084,  3.995,  0.986
+TOP, NIWA, WHOLE-SEQ,std-dev,  4.125,  3.843,122.263,  0.000,  3.163,  1.502,  2.193,  2.973,  1.697,  2.544,  1.628,  1.989,  3.529,  2.816,  1.642,  1.941,  2.154,  2.685,  2.740,  2.226,  2.097,  2.674,  1.094,  1.514,  4.786,  3.540,  6.298,  5.586,  2.709,  2.227,  0.033,  0.045,  0.109,  0.026,  0.115,  0.032
+LOW, NIWA, AVG, 5.208, DEV, 1.747, N, 125, TAG, percentage-solubility
+LOW, NIWA, WHOLE-SEQ,headers,K-R,D-E,naa,totperc,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,K+R,D+E,K+R-D-E,K+R+D+E,F+W+Y,pI,KyteDoo,abs-charge,FoldIndex,disorder,entropy,betapropensity
+LOW, NIWA, WHOLE-SEQ,average, -2.651, -0.520,437.096,100.000,  8.922,  1.228,  5.438,  5.957,  3.800,  6.932,  2.923,  5.307,  3.812, 10.676,  2.674,  4.060,  4.525,  4.901,  6.463,  5.892,  5.369,  6.001,  1.833,  3.286, 10.275, 11.395, -1.120, 21.670,  8.918,  6.988,  0.473,  0.016,  0.150, -0.082,  4.113,  0.992
+LOW, NIWA, WHOLE-SEQ,std-dev,  2.618,  2.230,204.698,  0.000,  2.227,  0.743,  1.187,  1.547,  1.272,  1.912,  1.117,  1.682,  1.387,  2.481,  0.906,  1.623,  1.274,  1.557,  1.764,  1.517,  1.470,  1.585,  0.956,  1.565,  1.793,  1.623,  1.643,  2.998,  2.599,  1.476,  0.021,  0.011,  0.060,  0.020,  0.056,  0.021
+# calc_flag, signed_diff_zscore, y/yes or n/no to include or not in prediction, taken from p-value of overall correlations
+# also avoid most straight linear combinations of aas in prediction
+# then, in the prediction scheme, for each feature to be included (y/n) we will have:
+# feature x, let xL = x avg in LOW set, xT = x avg in TOP set, and measured prop y, yL = avg in LOW set, yT = avg in TOP set
+# for feature value x:
+# if x < xL, x=xL or if x > xT, x=xT, and predicted y = yL + [(x-xL)/( xT-xL)]*(yT-yL)
+# followed by linear combination with abs value of zscore_diff for each x/feature
+# our current (hopefully FINAL) list of 10 features for prediction is given in as y/n
+ZDF,NIWA,WHOLE-SEQ,headers,K-R,D-E,naa,totperc,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,K+R,D+E,K+R-D-E,K+R+D+E,F+W+Y,pI,KyteDoo,abs-charge,FoldIndex,disorder,entropy,betapropensity
+ZDF, NIWA, WHOLE-SEQ, zscore-diff,     1.096,    -0.333,    -1.523,     0.000,     0.026,     0.043,     0.370,     0.724,    -0.563,    -0.132,    -0.484,     0.090,     1.118,    -0.557,     0.288,    -0.160,    -0.234,    -0.099,    -0.419,    -0.065,     0.053,     0.522,    -0.762,    -0.555,     0.548,     0.876,    -0.116,     0.947,    -0.931,    -0.225,    -0.390,     1.105,    -0.711,    -0.103,    -1.382,    -0.258
+ZDF, NIWA, WHOLE-SEQ, use_for_prob,y,n,y,n,n,n,n,n,n,n,y,n,n,y,n,n,n,n,n,n,n,y,n,n,n,y,n,n,y,n,n,y,y,n,y,n
--- a/server_prediction_seq_export.pl
+++ b/server_prediction_seq_export.pl
+#!/usr/bin/perl -w
+# server_prediction_seq_export.pl
+#
+# First part of the script: read seq_reference_data.txt (in this script's
+# directory) and check that it is complete.
+#
+# Reference lines are comma separated; all whitespace is removed before
+# splitting. Lines starting with '#' are comments. Recognised lines:
+#   <set>,..,AVG,<value>,...          10 fields, <set> is POP, TOP or LOW
+#   <set>,..,..,headers,...           40 fields
+#   <set>,..,..,average,...           40 fields
+#   <set>,..,..,std-dev,...           40 fields
+#   ZDF,..,..,zscore-diff,...         40 fields
+#   ZDF,..,..,use_for_prob,...        40 fields
+# Values kept for the rest of the script: $population_avg, $top_prop_avg,
+# $low_prop_avg, $headers_store, $top_avg_line, $low_avg_line, $pop_avg_line,
+# $pop_dev_line, $zdiff_line, $use_for_prob_line.
+# Problems are reported by appending a "stopping" message to blah.txt and
+# exiting.
+use FindBin;
+$file_here = "$FindBin::Bin/blah.txt";
+open (BLAH, ">>$file_here") or die "cannot open blah.txt\n";    # file for comments used throughout server code
+
+# Append $message to blah.txt and stop the script.
+sub stop_with_message {
+  my ($message) = @_;
+  print BLAH $message;
+  exit;
+}
+
+# True when a "seen" marker has been set (tolerates markers never assigned).
+sub is_seen {
+  my ($marker) = @_;
+  return (defined $marker and $marker eq 'seen');
+}
+
+my $AVG_FIELDS     = 10;    # fields expected on an AVG line
+my $FEATURE_FIELDS = 40;    # fields expected on every per-feature line
+
+# read in the reference data for this dataset.
+$file_here = "$FindBin::Bin/seq_reference_data.txt";
+open (REF, "<$file_here") or die "cannot open seq_reference_data.txt\n";
+@ref = <REF>;
+close (REF);
+
+foreach $line_spaces (@ref) {
+  chomp $line_spaces;
+  $line   = $line_spaces;
+  $line   =~ s/\s//g;
+  @words  = split (",",$line);
+  $nwords = @words;
+  next if $nwords == 0;                     # null line
+  next if substr ($line,0,1) eq "#";        # comment line
+
+  # short lines have no field 2/3; treat those as empty rather than undefined
+  my $set  = $words[0];
+  my $tag  = defined $words[2] ? $words[2] : '';
+  my $kind = defined $words[3] ? $words[3] : '';
+
+  # the field counts are checked, but not the format of the fields
+  if ($tag eq 'AVG') {
+    if ($set eq 'POP') { $population_avg = $words[3]; }
+    stop_with_message ("stopping *** number of fields in AVG line mismatch\n") if $nwords != $AVG_FIELDS;
+    $AVG_line{$set} = 'seen';
+    if ($set eq 'TOP') { $top_prop_avg = $words[3]; }
+    if ($set eq 'LOW') { $low_prop_avg = $words[3]; }
+  } elsif ($kind eq 'headers') {
+    stop_with_message ("stopping *** number of fields in headers line mismatch\n") if $nwords != $FEATURE_FIELDS;
+    $headers_line{$set} = 'seen';
+    if ($set eq 'POP') { $headers_store = $line; }    # store the headers just once, for results write
+  } elsif ($kind eq 'average') {
+    stop_with_message ("stopping *** number of fields in average line mismatch\n") if $nwords != $FEATURE_FIELDS;
+    $average_line{$set} = 'seen';
+    # store top and low averages to derive the linear fits (per feature)
+    if    ($set eq 'TOP') { $top_avg_line = $line; }
+    elsif ($set eq 'LOW') { $low_avg_line = $line; }
+    elsif ($set eq 'POP') { $pop_avg_line = $line; }
+  } elsif ($kind eq 'std-dev') {
+    stop_with_message ("stopping *** number of fields in std-dev line mismatch\n") if $nwords != $FEATURE_FIELDS;
+    $std_dev_line{$set} = 'seen';
+    # only the population std deviations are used; TOP and LOW ones are not stored
+    if ($set eq 'POP') { $pop_dev_line = $line; }
+  } elsif (($set eq 'ZDF') and ($kind eq 'zscore-diff')) {
+    stop_with_message ("stopping *** number of fields in ZDF zscore-diff line mismatch\n") if $nwords != $FEATURE_FIELDS;
+    $zdiff_line = $line;
+    $zdf_values = 'seen';
+  } elsif (($set eq 'ZDF') and ($kind eq 'use_for_prob')) {
+    stop_with_message ("stopping *** number of fields in ZDF use_for_prob line mismatch\n") if $nwords != $FEATURE_FIELDS;
+    $use_for_prob_line = $line;
+    $zdf_use           = 'seen';
+  }
+}                                           # end line read
+
+# every data set needs its AVG, headers, average and std-dev lines
+for $key_here ('POP', 'TOP', 'LOW' ) {
+  if (!is_seen ($headers_line{$key_here}) or !is_seen ($average_line{$key_here}) or !is_seen ($std_dev_line{$key_here})) {
+    stop_with_message ("stopping **** headers/average/std_dev line unseen for $key_here\n");
+  }
+  if (!is_seen ($AVG_line{$key_here})) {
+    stop_with_message ("stopping **** AVG line unseen for $key_here\n");
+  }
+}
+# both ZDF lines are needed for the weights; stop here as the message says,
+# rather than carrying on with undefined weights
+if (!is_seen ($zdf_values) or !is_seen ($zdf_use)) {
+  stop_with_message ("stopping **** ZDF zscore-diff/use_for_prob line unseen\n");
+}
+# Unpack the stored reference lines into arrays. In each array the 36 feature
+# values sit at indices 4-39 (inclusive, index starting at 0), so feature d
+# (1..36) is found at index d+3.
+$low_prop = $low_prop_avg;
+$top_prop = $top_prop_avg;
+@heads    = split (",",$headers_store);         # feature names
+@pops     = split (",",$pop_avg_line);          # population averages
+@pop_devs = split (",",$pop_dev_line);          # population std deviations
+@lows     = split (",",$low_avg_line);          # LOW set averages
+@tops     = split (",",$top_avg_line);          # TOP set averages
+@zdiffs   = split (",",$zdiff_line);            # zdiff_scores remain signed at this point
+@use      = split (",",$use_for_prob_line);     # values are y,n
+
+# Read the per-sequence composition data. Only WHOLE-SEQ lines are handled:
+# the header line (second field ORF-ID) is noted and skipped, and every other
+# WHOLE-SEQ line must carry 38 fields: flag, ID and 36 feature values.
+# Results: $nseqs, $ids[1..$nseqs], $data[1..$nseqs][1..36].
+$file_here = "$FindBin::Bin/seq_composition.txt";
+open (COMP, "<$file_here") or die "cannot open seq_composition.txt\n";
+$headers_found = "no";
+$nseqs         = 0;
+while (defined (my $raw_line = <COMP>)) {
+  chomp $raw_line;
+  (my $packed = $raw_line) =~ s/\s//g;          # drop all whitespace before splitting
+  my @fields = split (",",$packed);
+  next unless exists $fields[0];                # null line
+  next unless $fields[0] eq "WHOLE-SEQ";        # other record types are not processed here
+  if (($headers_found eq "no") and (exists $fields[1]) and ($fields[1] eq "ORF-ID")) {
+    $headers_found = "yes";
+  }
+  next if $fields[1] eq "ORF-ID";               # header line carries no data
+  if (@fields != 38) {
+    printf BLAH "stopping **** fields ne 38 in composition file data line\n";
+    exit;
+  }
+  $nseqs++;
+  $ids[$nseqs] = $fields[1];
+  @{ $data[$nseqs] }[1 .. 36] = @fields[2 .. 37];
+}
+close (COMP);
+# Final part: write seq_prediction.txt (in this script's directory).
+# Output is CSV: LEGEND and HEADERS lines first, then for every sequence a
+# SEQUENCE PREDICTIONS, a SEQUENCE WEIGHTS and a SEQUENCE DEVIATIONS line.
+# Uses the reference arrays (@heads, @pops, @pop_devs, @lows, @tops, @zdiffs,
+# @use, feature d at index d+3), $low_prop, $top_prop, $population_avg and the
+# sequence data ($nseqs, @ids, @data) set up earlier in the script.
+$file_here = "$FindBin::Bin/seq_prediction.txt";
+open (PREDA, ">$file_here") or die "cannot open seq_prediction.txt\n";
+
+# features are numbered 1..36; number 4 (totperc) is left out of all output
+my @report_cols = (1 .. 3, 5 .. 36);
+
+# Feature weights do not depend on the sequence, so get them once:
+# |zscore-diff| for features flagged 'y', zero for all others.
+$wt_sum = 0;
+for ($d=1; $d<=36; $d++) {
+  $wt_feature[$d] = ($use[$d+3] eq 'y') ? abs ($zdiffs[$d+3]) : 0;
+  $wt_sum        += $wt_feature[$d];
+}
+
+# before the sequence data writes, output things that don't change
+print PREDA "LEGEND 35 sequence features are calculated, including 20 amino acid compositions\n";
+print PREDA "LEGEND All features are calculated over a sliding 21 amino acid window\n";
+print PREDA "LEGEND   KmR=KminusR DmE=DminusE KpR=KplusR PmN=K+R-D-E PpN=K+R+D+E aro=F+W+Y\n";
+print PREDA "LEGEND   fld = border of folded (pos) / unfolded (neg) Uversky et al Proteins 2000 41:415\n";
+print PREDA "LEGEND   dis = disorder propensity Rune & Linding NAR 2003 31:3701\n";
+print PREDA "LEGEND   bet = beta strand propensities Costantini et al 2006 BBRC 342:441\n";
+print PREDA "LEGEND   mem = Kyte-Doolittle hydropathy normalised (-1 to +1) JMB 157:105\n";
+print PREDA "LEGEND Only a subset of features used for prediction, according to best fit against data\n";
+print PREDA "LEGEND   underlying solubility data: cell-free expression Niwa et al PNAS 2009 106:4201\n";
+print PREDA "LEGEND Features used for prediction are scaled 0 - 1 over range in underlying dataset\n";
+print PREDA "HEADERS PREDICTIONS LINE,ID,percent-sol,scaled-sol,population-sol,pI\n";
+print PREDA join (",", "HEADERS FEATURES ORIGINAL", "ID", map { $heads[$_+3] } @report_cols), "\n";
+# short plot names: fixed names, except the 20 amino acid letters (features 5-24)
+print PREDA join (",", "HEADERS FEATURES PLOT", "ID", "KmR", "DmE", "len",
+                  (map { $heads[$_+3] } 5 .. 24),
+                  "KpR,DpE,PmN,PpN,aro,pI,mem,chr,fld,dis,ent,bet"), "\n";
+print PREDA "\n";
+
+for ($n=1; $n<=$nseqs; $n++) {
+  $pred_sum = 0;
+  for ($d=1; $d<=36; $d++) {
+    if ($d != 4) {                              # (X - Xavg-pop) / std-dev-pop i.e. a neg/pos scale
+      $norm_dev[$d] = ($data[$n][$d] - $pops[$d+3]) / $pop_devs[$d+3];
+    }
+    next unless $use[$d+3] eq 'y';              # feature not used for prediction
+
+    # clamp the feature value to the range spanned by the LOW and TOP averages
+    # (the range may run either way), then interpolate the property linearly
+    my ($range_min, $range_max) = ($lows[$d+3] <= $tops[$d+3])
+                                ? ($lows[$d+3], $tops[$d+3])
+                                : ($tops[$d+3], $lows[$d+3]);
+    $data_here = $data[$n][$d];
+    if ($data_here < $range_min) { $data_here = $range_min; }
+    if ($data_here > $range_max) { $data_here = $range_max; }
+    $pred_here = $low_prop + ($top_prop - $low_prop) * ( ($data_here - $lows[$d+3]) / ($tops[$d+3] - $lows[$d+3]) );
+    $pred_sum += $wt_feature[$d]*$pred_here;
+  }
+  $prediction        = $pred_sum / $wt_sum;     # weighted mean over the used features
+  $prediction_scaled = ($prediction - $low_prop) / ($top_prop - $low_prop);
+  $population_scaled = ($population_avg - $low_prop) / ($top_prop - $low_prop);
+  $pI_here           = $data[$n][30];           # feature 30 is pI
+
+  # output sequence-specific data; IDs are written with print so that a '%'
+  # in an ID is never taken as a printf conversion
+  print  PREDA "SEQUENCE PREDICTIONS,$ids[$n]";
+  printf PREDA ",%6.3f,%6.3f,%6.3f,%6.3f\n", $prediction,$prediction_scaled,$population_scaled,$pI_here;
+  print  PREDA "SEQUENCE WEIGHTS,$ids[$n]";
+  foreach my $col (@report_cols) { printf PREDA ",%6.3f", $wt_feature[$col]/$wt_sum; }
+  print  PREDA "\n";
+  # output the devs/std-dev for each feature, for this sequence
+  print  PREDA "SEQUENCE DEVIATIONS,$ids[$n]";
+  foreach my $col (@report_cols) { printf PREDA ",%6.3f", $norm_dev[$col]; }
+  print  PREDA "\n\n";
+}
+close (PREDA);
+close (BLAH);
+exit;
--- a/ss_propensities.txt
+++ b/ss_propensities.txt
+comment amino acid sec struc propensities from
+comment Costantini et al (2006) BBRC 342:441-451
+comment aa3 aa1  freq    Pa    Pb    Pc
+data    ALA   A  7.73  1.39  0.75  0.80
+data    CYS   C  1.84  0.74  1.31  1.05
+data    ASP   D  5.82  0.89  0.55  1.33
+data    GLU   E  6.61  1.35  0.72  0.86
+data    PHE   F  4.05  1.01  1.43  0.76
+data    GLY   G  7.11  0.47  0.65  1.62
+data    HIS   H  2.35  0.92  0.99  1.07
+data    ILE   I  5.66  1.04  1.71  0.59
+data    LYS   K  6.27  1.11  0.83  1.00
+data    LEU   L  8.83  1.32  1.10  0.68
+data    MET   M  2.08  1.21  0.99  0.83
+data    ASN   N  4.50  0.77  0.62  1.39
+data    PRO   P  4.52  0.50  0.44  1.72
+data    GLN   Q  3.94  1.29  0.76  0.89
+data    ARG   R  5.03  1.17  0.91  0.91
+data    SER   S  6.13  0.82  0.85  1.24
+data    THR   T  5.53  0.76  1.23  1.07
+data    VAL   V  6.91  0.89  1.86  0.64
+data    TRP   W  1.51  1.06  1.30  0.79
+data    TYR   Y  3.54  0.95  1.50  0.78
+comment suggested algorithm is simply to sum over
+comment window (eg 7) and take largest value = ss