#!/usr/bin/env /bin/bash

# Calculate the variance between email numbers.

# Title: Calculate variance between email numbers from the CRU archive FOIA2009.zip
# Author: Lance Levsen
# Email: lance@catprint.ca
# Date: Sat Dec  5 20:24:12 CST 2009
# Filename: /home/lance/Desktop/FOIA_Leaked/email_variance.sh

# Collect the email numbers into the global indexed array "numbers".
#
# Every entry of the given directory contributes its name up to (but not
# including) the first dot, e.g. 0826209667.txt -> 0826209667. Entries
# are taken in the shell's sorted glob order. The directory is globbed
# instead of parsing ls output, so a name containing whitespace stays a
# single array element.
#
# Globals:   numbers (written), index (written; ends up as the element count)
# Arguments: $1 - directory holding the numbered mail files
# Outputs:   a diagnostic on stderr when $1 is not a directory
# Returns:   0 on success, 1 when $1 is not a directory
load_email_numbers() {
	local mail_dir=$1
	local path name

	numbers=()
	index=0

	if [[ ! -d "${mail_dir}" ]]; then
		printf 'email_variance: mail directory not found: %s\n' "${mail_dir}" >&2
		return 1
	fi

	for path in "${mail_dir}"/*; do
		# An empty directory leaves the pattern unexpanded; skip that
		# placeholder, but keep dangling symlinks since they are real
		# directory entries.
		[[ -e "${path}" || -L "${path}" ]] || continue
		# Strip the directory part, then everything from the first dot on.
		name=${path##*/}
		numbers[index]=${name%%.*}
		index=$(( index + 1 ))
	done
}

# Index of the next free slot in the array. Zero based.
index=0

# Declare the indexed array of email numbers.
declare -a numbers

# Load the numbered files from the mail directory of FOIA2009.zip.
load_email_numbers ~/Desktop/FOIA/mail

# Bookkeeping for the pairwise pass over "numbers", in which each
# element has the element before it subtracted from it.

# How many email numbers the array holds.
total_elem=${#numbers[@]}

# Indexed array that receives the differences.
declare -a variance

# Position of the first element that has a predecessor, i.e. the
# second element of the zero-based array.
index2=1

# Remove the old results files and the gnuplot pngs from the current
# directory.
#
# -f keeps a first run quiet: files that do not exist yet, and an
# unmatched *.png pattern, are not errors. "--" together with the "./"
# prefix stops a png whose name begins with "-" from being read as an
# option to rm.
#
# Returns: the exit status of rm
clean_previous_results() {
	/bin/rm -f -- variance_results.txt variance_results_verbose.txt ./*.png
}

clean_previous_results

# For every element of "numbers" after the first, subtract the previous
# element from it and record the difference.
#
# Each difference is stored in variance[i], appended with an explanatory
# line to ./variance_results_verbose.txt, and appended on its own line
# to ./variance_results.txt (the plotting input). The two files are
# opened once for the whole loop (descriptors 3 and 4) rather than once
# per line, and printf is used instead of "echo -e", whose option
# handling differs between implementations.
#
# Globals:   numbers (read), variance (written), index2 (written; equals
#            the element count afterwards)
# Outputs:   appends to the two results files; diagnostic on stderr when
#            there is nothing to compare
# Returns:   0 on success, 1 when numbers holds fewer than two elements
#            (no file is created in that case)
write_variances() {
	local count=${#numbers[@]}

	if (( count < 2 )); then
		printf 'email_variance: need at least two email numbers, have %d\n' "${count}" >&2
		return 1
	fi

	for (( index2 = 1; index2 < count; index2++ )); do
		# Run the math through bc: it has no fixed integer width, and it
		# reads zero-padded numbers as decimal, where shell arithmetic
		# would treat a leading 0 as octal.
		variance[index2]=$(/usr/bin/bc <<< "${numbers[index2]} - ${numbers[index2 - 1]}")
		# A little text to explain the results.
		printf 'Email Number: %s\tVariance from the last Number: %s\n' \
			"${numbers[index2]}" "${variance[index2]}" >&3
		# For plotting.
		printf '%s\n' "${variance[index2]}" >&4
	done 3>> ./variance_results_verbose.txt 4>> ./variance_results.txt
}

write_variances

# Create a numerically sorted data set
/usr/bin/sort -n  variance_results.txt > variance_sorted_numerically.txt

# Create graphs of variances.
/usr/bin/gnuplot ./gnuplotcmds

# Mean and Std. Deviation.
/usr/bin/awk '{ sum += $1; sumsq += $1*$1 } END { printf "Number of records: %i, Mean: %f, Std Deviation: %f\n", NR, sum/NR, sqrt(sumsq/NR - (sum/NR)^2) }' variance_results.txt > mean_deviation.txt

# Finished: leave with the status of the last command that ran.
exit "$?"

