#!/usr/bin/php -q
<?php
/**
 * Daemon that reads the Sixapart Atom update stream, counts occurrences of a
 * fixed list of terms in the post content, and pushes per-term rates to a
 * Meteor controller on channel "angst".
 *
 * For each term a list of occurrence timestamps is kept. Whenever a term is
 * seen again, timestamps older than 5 minutes are dropped and the number of
 * remaining occurrences divided by 5 (i.e. occurrences per minute, averaged
 * over 5 minutes) is sent to Meteor if it differs from the last value sent
 * by more than 2%.
 *
 * NOTE(review): this file was recovered from a copy in which markup had been
 * stripped. The "<?php" open tag, the entity literals in $entities and the
 * tag names in the two regular expressions below are reconstructions; the
 * places concerned are marked individually.
 */

// Remove time limit
set_time_limit(0);

// Increase memory allocation
ini_set("memory_limit", "32M");

// Set error reporting level - use PHP's internal error handler
error_reporting(E_ALL);

// Set up list of words to look for
// NOTE(review): 'consience' looks like a misspelling of 'conscience'; left
// unchanged because it alters which words are counted - confirm intent.
$terms = array('damn', 'boyfriend', 'girlfriend', 'angst', 'depression', 'fuck', 'shit', 'me', 'life', 'nothing', 'ohmigod', 'ew', 'nothing', 'parents', 'school', 'crap', 'cunt', 'bastard', 'love', 'hate', 'everyone', 'lousy', 'lonely', 'mean', 'bitch', 'stupid', 'consience', 'tears', 'despair', 'pants', 'sad', 'trash', 'heavy', 'emotion', 'boring', 'hot', 'boy', 'girl', 'grade', 'teacher');

// Keyed copy of $terms so that membership is a hash lookup instead of a
// linear in_array() scan for every word of every post.
$termlookup = array_flip($terms);

// Alternatively you can use a blacklist, rather than whitelist approach.
// This is a bit fairer from a scientific point of view but nowhere near as
// interesting. If you prefer to exclude stop words rather than include only
// target words, uncomment the following array and, in the word loop below,
// swap the active "if (isset($termlookup[$word]))" test for the commented-out
// stop word test that follows it.
//$stopwords = array("a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost",
//    "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and",
//    "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back",
//    "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
//    "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can",
//    "cannot", "cant", "co", "computer", "com", "con", "could", "couldnt", "cry", "day", "de", "describe", "detail",
//    "did", "didn", "do", "doesn", "don", "done", "down", "due", "during", "each", "een", "eg", "eight", "either",
//    "eleven", "else", "elsewhere", "empty", "en", "enough", "etc", "even", "ever", "every", "everywhere", "except",
//    "few", "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty",
//    "found", "four", "from", "front", "full", "further", "get", "give", "gm", "go", "going", "good", "got", "had",
//    "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
//    "herself", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
//    "interest", "into", "ik", "is", "it", "its", "itself", "just", "keep", "know", "la", "le", "last", "latter",
//    "latterly", "least", "less", "like", "ll", "ltd", "made", "many", "may", "meanwhile", "might", "mill", "more",
//    "moreover", "most", "mostly", "move", "much", "must", "my", "name", "namely", "neither", "never",
//    "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere",
//    "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours",
//    "ourselves", "out", "over", "own", "part", "per", "perhaps", "put", "que", "rather", "re", "really", "said",
//    "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side",
//    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes",
//    "somewhere", "still", "such", "take", "ten", "than", "that", "the", "their", "then", "thence", "there",
//    "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thin", "things", "think",
//    "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too",
//    "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "ve", "very",
//    "via", "was", "way", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where",
//    "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
//    "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you",
//    "your", "yours", "yourself", "yourselves");

// Set up replacements for common entities that PHP won't convert.
// NOTE(review): the entity names were lost in the recovered source (the
// literals had been decoded, leaving an unparsable """). They are rebuilt
// here from the replacement characters; "&apos;" for the first entry is a
// guess ("&#39;" is equally plausible) - confirm against the original.
$entities = array("&apos;", "&nbsp;", "&lt;", "&gt;", "&quot;");
$replacements = array("'", " ", "<", ">", "\"");

// Arrays for storing data
$counts = array();        // word => list of occurrence timestamps (oldest first)
$prevcounts = array();    // word => last rate sent to Meteor
$wordschanged = array();  // set (word => true) of words seen since the last send

// Output and input file handles
$op = false;
$ip = false;
$buffer = "";

// Infinite loop turns the script into a daemon
while (true) {

    // Open a connection to the Sixapart stream
    if (!is_resource($ip) or feof($ip)) {

        // Release the previous socket, and drop any partial item left over
        // from it so that it cannot be joined to the new response.
        if (is_resource($ip)) {
            fclose($ip);
        }
        $buffer = "";

        $ip = fsockopen("updates.sixapart.com", 80, $errno, $errstr, 10);
        if (!$ip) {
            echo "$errstr ($errno)\n\n";
            exit(1);
        }
        $out = "GET /atom-stream.xml HTTP/1.1\r\n";
        $out .= "Host: updates.sixapart.com\r\n";
        $out .= "Connection: Close\r\n\r\n";
        fwrite($ip, $out);

        // Non-blocking, so that reading the stream never stalls the loop
        stream_set_blocking($ip, false);
    }

    // Open a controller channel to the Meteor server
    if (!is_resource($op) or feof($op)) {
        echo "Reconnecting to Meteor\n";
        if (is_resource($op)) {
            fclose($op);
        }
        if (!($op = fsockopen("meteorcontroller", 4671, $errno, $errstr, 5))) {
            echo "Meteor not responding\n";
            sleep(5);
            continue;
        }
        stream_set_blocking($op, false);
    }

    // Read from Sixapart stream (fread returns false on error; append only real data)
    $chunk = fread($ip, 4096);
    if ($chunk !== false) {
        $buffer .= $chunk;
    }

    // Check for new items.
    // NOTE(review): the pattern was empty in the recovered source. It is
    // reconstructed on the assumption that each item is an Atom <entry>
    // element - confirm against the stream format.
    $nummatches = preg_match_all("/<entry\b.*?<\/entry>/si", $buffer, $matches, PREG_PATTERN_ORDER + PREG_OFFSET_CAPTURE);
    if ($nummatches > 0) {
        foreach ($matches[0] as $item) {

            // Extract the content part. The opening tag was missing from the
            // recovered pattern and is restored to pair with the closing
            // </content> tag that survived.
            if (preg_match("/<content[^>]*>\s*(.*?)\s*<\/content>/si", $item[0], $m)) {

                // Filter for non-ASCII posts (mainly non-English posts):
                // count bytes above 127 among the first 51 bytes of content
                $nonascii = 0;
                for ($i = min(50, strlen($m[1]) - 1); $i >= 0; $i--) {
                    if (ord($m[1][$i]) > 127) {
                        $nonascii++;
                    }
                }
                if ($nonascii < 10) {

                    // Decode and strip tags
                    $content = strip_tags(str_replace($entities, $replacements, html_entity_decode($m[1])));

                    // Remove any non-words (phone numbers, ascii art, weird l33t-speak...)
                    $content = preg_replace("/[^a-z\-]+/i", " ", $content);

                    // Split resulting text into words
                    $words = explode(" ", $content);
                    $keytermcount = 0;
                    echo "Indexing... ";
                    foreach ($words as $word) {

                        // Lowercase the word for comparison and check it is one of the target terms
                        $word = strtolower($word);
                        if (isset($termlookup[$word])) {
                        //if (strlen($word) > 1 and !in_array($word, $stopwords)) {

                            // Add the time of this occurrence to the occurrences list for this word
                            if (!isset($counts[$word])) {
                                $counts[$word] = array();
                            }
                            $counts[$word][] = time();
                            $keytermcount++;

                            // Add the word to the words changed set for quick reference later
                            $wordschanged[$word] = true;
                        }
                    }
                    echo sizeof($words)." words: ".$keytermcount." key terms.\n";
                }
            }
        }

        // Trim the input buffer to remove any whole items that we've processed
        $last = $matches[0][$nummatches - 1];
        $cutoff = $last[1] + strlen($last[0]);
        echo "Trimming buffer at $cutoff... ";
        $buffer = substr($buffer, $cutoff);
        echo "Done\n";
    }

    // Loop over the words that have been updated with new occurrences.
    // NOTE(review): only words seen since the previous pass are re-evaluated,
    // so the rate for a word that stops appearing is never pruned or re-sent.
    // Confirm whether that is intended.
    echo "Sending updates\n";
    foreach (array_keys($wordschanged) as $w) {

        // Remove any occurrences that are older than 5 mins
        while (sizeof($counts[$w]) and $counts[$w][0] < (time() - 300)) {
            array_shift($counts[$w]);
        }

        // Calculate the new 5 min rolling average and compare it to the previous one
        $prev = (isset($prevcounts[$w])) ? $prevcounts[$w] : 0;
        $now = round(sizeof($counts[$w]) / 5, 1);

        // If more than 2% different, send the update to Meteor
        if (abs($now - $prev) > (0.02 * $prev)) {
            $out = "ADDMESSAGE angst {w:'".addslashes($w)."',c:".$now."}\n";
            echo "> $out";
            fwrite($op, $out);
            $prevcounts[$w] = $now;
        }
    }
    $wordschanged = array();

    // Give Meteor time to respond - 10ms - then drain whatever it sent back
    usleep(10000);
    fread($op, 4096);

    // Reset the counts arrays
    $newcounts = $counts;
    unset($counts);
    $counts = $newcounts;
    unset($newcounts);

    // Sleep for 500ms
    // Since the rolling average is calculated with second resolution
    // it's pointless looping more than twice a second
    echo "Sleeping\n";
    usleep(500000);
}