#!/usr/bin/perl

# tweet/ukgc stream grab .pl v1.0
# 2009-02-16
# Graham Lally
# http://twitter.com/exmosis
#
# Grabs all (public) tweets for a specified search term from
# search.twitter.com and outputs a CSV file which can be imported into,
# e.g. Google Docs.
#
# Not pretty, but works.
#
# PLEASE check http://apiwiki.twitter.com/Search+API+Documentation before
# use, especially if running queries that will return large amounts of
# tweets. I am not responsible for any mis-use of the Twitter API or anything
# bad that happens to your house while you run this script in true Dade
# Murphy style.
#
# Requirements:
#  - JSON: http://search.cpan.org/dist/JSON/
#  - curl: http://curl.haxx.se/
#
# Run via command line with no arguments. Re-direct into a file to save.
#  e.g. perl ukgcstream_V1.pl > output.csv
#
# Output (STDOUT): one CSV header line, then one quoted CSV row per tweet in
# chronological order. Each row holds the saved tweet fields, the character
# count of the tweet text, the 10-minute time band the tweet falls in, and
# the number of tweets in that band.
# Diagnostics (STDERR): a final "Finished with N rows." summary, so that the
# redirected CSV file contains nothing but CSV.
#
# Todo: Add user agent, support for since_id.

use strict;
use warnings;

use JSON;

# Fields to pull out from Twitter JSON feed
my @item_save_fields = (
    'id',
    'from_user_id',
    'from_user',
    'created_at',
    'text',
    'profile_image_url',
);

# Month abbreviation => zero-padded month number ('Jan' => '01' ... 'Dec' => '12')
my @months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec');
my %month_number_of;
@month_number_of{@months} = map { sprintf '%02d', $_ } 1 .. scalar @months;

# Number of tweets seen in each 10-minute time band, keyed by band label
my %time_count;

# Locate curl; backticks yield undef if the shell itself cannot be started
my $curl = `which curl`;
$curl = '' unless defined $curl;
chomp $curl;
die "Cannot find curl on the PATH\n" if $curl eq '';

# API URL call to twitter
my $request = 'http://search.twitter.com/search.json';

# Search query to use - currently #ukgc09 (%23 is a hash)
# Change this for a different search
my $arg = '?q=%23ukgc09';

# output headers for file: saved fields, then tweet character count, the
# 10 minute time-band, and the number of tweets in that time band (for the
# Google Timeseries widget)
print join(',', @item_save_fields, 'count', 'time_10min', 'time_count'), "\n";

# Each element is [ $csv_row_without_time_count, $time_band_key ]; the band
# key is kept next to the row so it never has to be parsed back out of CSV.
my @rows;

# Keep looping until no next page found (when $arg will = '')
while (length $arg) {
    my $url = $request . $arg;

    # Make request via curl
    my $json_text = fetch_url($curl, $url);

    my $perl_scalar  = eval { JSON->new->decode($json_text) };
    my $decode_error = $@;
    if (ref($perl_scalar) ne 'HASH') {
        $decode_error = "unexpected response\n" if $decode_error eq '';
        die "Could not parse JSON from $url: $decode_error";
    }

    # A response without a 'results' array is treated as an empty page
    my $results = $perl_scalar->{'results'};
    my @results = ref($results) eq 'ARRAY' ? @{$results} : ();

    # Process results
    for my $tweet (@results) {
        my ($created_at, $time_band, $is_banded)
            = parse_created_at($tweet->{'created_at'});

        # re-insert the reformatted date so it is emitted with the other fields
        $tweet->{'created_at'} = $created_at;
        $time_count{$time_band}++ if $is_banded;

        # The count column is taken before non-ASCII characters are stripped
        my $text = defined $tweet->{'text'} ? $tweet->{'text'} : '';

        # Put row data (without time_count) into array
        my @values = (
            (map { $tweet->{$_} } @item_save_fields),
            length($text),
            $time_band,
        );
        my $row = join ',', map { csv_quote($_) } @values;
        $row =~ s/[^[:ascii:]]+//g;

        # slide row into first place in array to put into chronological order
        # (the API returns the newest tweets first)
        unshift @rows, [ $row, $time_band ];
    }

    # See if we have a next page to get
    my $next_page = $perl_scalar->{'next_page'};
    $arg = defined $next_page ? $next_page : '';
}

# All entries have been chirped, output data including counts for each time band
for my $entry (@rows) {
    my ($row, $time_band) = @{$entry};

    # Rows whose date could not be parsed have no band count: emit an empty cell
    my $row_time_count = $time_count{$time_band};
    $row_time_count = '' unless defined $row_time_count;

    print $row, ',"', $row_time_count, '"', "\n";
}

# Summary goes to STDERR so it does not end up inside the redirected CSV
print STDERR "Finished with ", scalar(@rows), " rows.\n\n";

# fetch_url($curl_path, $url)
# Runs curl on $url and returns everything it writes to STDOUT ('' if
# nothing). The list form of the pipe open bypasses the shell, so characters
# in $url (which comes from the server's next_page value) are never
# interpreted as shell syntax. Dies if curl cannot be started or exits with a
# non-zero status.
sub fetch_url {
    my ($curl_path, $url) = @_;

    open my $curl_fh, '-|', $curl_path, $url
        or die "Cannot run $curl_path: $!\n";
    my $body = do { local $/; <$curl_fh> };
    close $curl_fh
        or die "curl failed for $url (exit status " . ($? >> 8) . ")\n";

    return defined $body ? $body : '';
}

# parse_created_at($raw)
# Converts a date such as "Mon, 02 Feb 2009 12:32:12 +0000" and returns a
# three-element list:
#   - the date reformatted as "2009-02-02 12:32:12";
#   - the 10-minute band label, rounded to the band's 5-minute midpoint,
#     e.g. "2009-02-02 12:35:00";
#   - a true flag saying the band should be counted.
# When the input does not have the expected shape (or names an unknown
# month), returns the input unchanged, the input with its weekday prefix and
# "+0000" suffix removed as the band label, and a false flag.
sub parse_created_at {
    my ($raw) = @_;
    $raw = '' unless defined $raw;

    my $stripped = $raw;
    $stripped =~ s/\s\+0000\s*$//;
    $stripped =~ s/^\w{3},\s//;

    if ($stripped =~ /^(\d{2})\s(\w{3})\s(\d{4})\s((\d\d):(\d)\d:\d\d)$/) {
        my ($day, $month_name, $year, $time, $hour, $minute_tens)
            = ($1, $2, $3, $4, $5, $6);
        my $month = $month_number_of{$month_name};

        if (defined $month) {
            my $date = $year . '-' . $month . '-' . $day;
            return (
                $date . ' ' . $time,
                $date . ' ' . $hour . ':' . $minute_tens . '5:00',
                1,
            );
        }
    }

    return ($raw, $stripped, 0);
}

# csv_quote($value)
# Returns $value wrapped in double quotes for use as a CSV field, with any
# embedded double quotes doubled. An undefined value becomes an empty field.
sub csv_quote {
    my ($value) = @_;
    $value = '' unless defined $value;
    $value =~ s/"/""/g;
    return '"' . $value . '"';
}