#!/usr/bin/perl
#--------------------------------------------------------------------
#-- get_search.pl - Get search results
#-- ==================================
#--
#-- Author: TheSuggmeister
#--
#-- Created : Jan 2010.
#--
#-- Caveat: This script is provided as is. I know it's messy, but it works
#--         (or at least it did). If you enhance it, re-write it etc, please
#--         feel free to email or tweet me so I can update this.
#--
#-- Description :
#--   1. Get search phrase (e.g. q=decappeal OR 70077).
#--   2. Iterate through the 1500 tweets we can get with one pass
#--      (rpp=100 and page=1 to 15), counting tweets as we go.
#--   3. Grab the tweet id and data from each tweet and append it to the
#--      raw dump and the CSV file.
#--   4. When we reach rpp=100 and page 15, capture a tweet id and start
#--      again with max_id=<that tweet id>.
#--   5. Repeat steps 2 through 4 until no more tweets are left
#--      (about 2 weeks of data max).
#--   6. Record the highest tweet id seen, so the next run knows where to stop.
#--
#-- References:
#--   - Here's a good article on Twitter Search
#--   - http://www.ibm.com/developerworks/opensource/library/x-twitsrchapi/index.html
#--   - "We only index Tweets for about two weeks"
#--     http://help.twitter.com/forums/31935/entries/66018
#--   - Pagination limits (1,500 statuses via page and rpp):
#--     http://apiwiki.twitter.com/Things-Every-Developer-Should-Know#6Therearepaginationlimits
#--
#--------------------------------------------------------------------
use strict;
use warnings;

#--------------------------------------------------------------------
#-- LWP stuff
#--------------------------------------------------------------------
use LWP::Simple;
require LWP::UserAgent;

my $browser = LWP::UserAgent->new;
$browser->agent('firefoxbeta/1.01');

#--------------------------------------------------------------------
#-- inits
#--------------------------------------------------------------------
my $searchStr = "decappeal+OR+dec.org+OR+70077";
my $delim     = "\n";    # the response is parsed one line at a time

my $state_file = 'DEC_lastTweetID.txt';    # highest tweet id of the previous run
my $raw_file   = 'DEC_SEARCH_RAW.raw';     # raw dump of every response
my $csv_file   = 'DEC_SEARCH_RAW.csv';     # id,date,time,name,uri per tweet

my $pages_per_pass = 15;                   # page=1..15 with rpp=100 => 1500 tweets

my $tweet_id         = 0;   # current tweet id; reset to 0 before each page is parsed
my $max_tweet_id     = 0;   # value sent as &max_id on every pass after the first
my $Prev_Tweet_Id    = 0;   # stop when we reach this id (read from $state_file)
my $lastLoopTweet_Id = 0;   # previous tweet id seen; a repeat means we are looping

#--------------------------------------------------------------------
#-- init counters
#--------------------------------------------------------------------
my $my_page     = 1;        # page 1 to 15
my $tweet_count = 0;        # number of tweet <id> lines seen (debugging aid)

#--------------------------------------------------------------------
#-- init flags
#--
#-- $stop_now           = don't parse any more lines of the current page.
#-- $stop_flag          = finish parsing the current page, but don't fetch
#--                       another one afterwards.
#-- $search_phrase_flag = counts consecutive <id> lines that hold the search
#--                       phrase instead of a tweet id; two in a row means no
#--                       tweets came back, so we stop.
#-- $firstloop_flag     = true during the first pass of 15 pages, which is
#--                       requested without &max_id.
#--------------------------------------------------------------------
my $stop_now           = 0;
my $stop_flag          = 0;
my $firstloop_flag     = 1;
my $search_phrase_flag = 0;

# Fields collected for the CSV row of the tweet currently being parsed.
my $mydate = '';
my $mytime = '';
my $name   = '';

#--------------------------------------------------------------------
#-- element_text($line, $tag, $close_prefix)
#--
#-- If $line contains "<$tag>...</$tag>", return a copy of the line with the
#-- first "<$tag>" and the first "$close_prefix</$tag>" removed; otherwise
#-- return undef. Any other text on the line (e.g. leading whitespace) is
#-- kept as-is. $close_prefix is optional and defaults to ''.
#--------------------------------------------------------------------
sub element_text {
    my ($line, $tag, $close_prefix) = @_;
    $close_prefix = '' unless defined $close_prefix;

    return unless $line =~ m{<$tag>.*?</$tag>};

    $line =~ s{<$tag>}{};
    $line =~ s{\Q$close_prefix\E</$tag>}{};
    return $line;
}

#--------------------------------------------------------
#--- Get the highest tweet id from file
#--
#-- We only want tweets between the latest one and the highest tweet id
#-- found by the previous run ($Prev_Tweet_Id).
#--
#-- NOTE(review): the file name read here is assumed to be the same file
#-- that is written at the end of the script - confirm.
#-- A missing file is not an error: it simply means this is the first run.
#--------------------------------------------------------
if (open my $state_in, '<', $state_file) {
    while (my $state_line = <$state_in>) {
        chomp $state_line;
        # Ignore blank/garbage lines rather than resetting the id to 0.
        $Prev_Tweet_Id = int($1) if $state_line =~ /^\s*(\d+)/;
    }
    close $state_in;
}

# Seed the high-water mark with the previous one so that a run which sees no
# tweets at all does not overwrite the state file with an empty value.
my $highest_tweet_Id = $Prev_Tweet_Id;

#--------------------------------------------------------
#--- Open data files for append
#--
#-- raw = raw dump of api output so it can be processed off line later.
#-- csv = CSV of the above.
#--------------------------------------------------------
open my $raw_fh, '>>', $raw_file or die "Cannot append to $raw_file: $!";
open my $csv_fh, '>>', $csv_file or die "Cannot append to $csv_file: $!";

#--------------------------------------------------------
#-- Where the main action is
#--
#-- 1. Get the first 15 pages of search results (no max_id, so we start
#--    from the most recent tweet).
#-- 2. Get another set of 15 pages, this time with max_id set.
#-- 3. Goto 2 until a stop condition is hit.
#--
#-- Notes:
#--   rpp  = number of tweets to return per page, up to a max of 100
#--   page = the page number (starting at 1) to return
#--------------------------------------------------------
PAGE:
while (!$stop_flag) {

    my $url = 'http://search.twitter.com/search.atom?q=' . $searchStr
            . '&rpp=100&page=' . $my_page;
    $url .= '&max_id=' . $max_tweet_id unless $firstloop_flag;

    # Advance the page counter for the NEXT request. The URL above has
    # already been built, so page 15 itself is still fetched below.
    # NOTE(review): $tweet_id here is the last id parsed from the previously
    # fetched page, not from page 15, so the next pass presumably overlaps
    # with page 15. The ordering is kept as it was because the repeated-id
    # stop check below depends on it - confirm before changing.
    if ($my_page == $pages_per_pass) {
        $my_page        = 1;
        $max_tweet_id   = $tweet_id;
        $firstloop_flag = 0;
    }
    else {
        $my_page++;
    }

    #--------------------------------------------------------
    #-- LWP getting our $url
    #--------------------------------------------------------
    my $response = $browser->get($url);
    if (!$response->is_success) {
        # A failed request carries no <id> lines, so none of the stop
        # conditions below could ever fire: bail out instead of spinning.
        warn "Request for $url failed: " . $response->status_line . "\n";
        last PAGE;
    }
    my $page = $response->content;

    #--------------------------------------------------------
    #-- append raw dump
    #--------------------------------------------------------
    print {$raw_fh} $page;

    #--------------------------------------------------------
    #-- parse the page line by line
    #--------------------------------------------------------
    $tweet_id = 0;

    LINE:
    for my $line (split $delim, $page) {

        # Once set, $stop_now is never cleared, so nothing further on this
        # page is of interest.
        last LINE if $stop_now;

        #--------------------------------------------------------
        #-- <id> lines: the text is split on ':' and the third piece is
        #-- either the tweet id or something starting with "search".
        #--------------------------------------------------------
        my $id_text = element_text($line, 'id');
        if (defined $id_text) {
            my @id_parts = split /:/, $id_text;
            my $id_field = defined $id_parts[2] ? $id_parts[2] : '';

            if ($id_field !~ m/^search/) {
                $search_phrase_flag = 0;
                # Non-numeric text counts as id 0, as int() would give.
                $tweet_id = $id_field =~ /^\s*(\d+)/ ? int($1) : 0;

                # Reached the point where the previous run started: stop now.
                if ($tweet_id <= $Prev_Tweet_Id) {
                    $stop_now  = 1;
                    $stop_flag = 1;
                }

                if ($tweet_id == $lastLoopTweet_Id) {
                    # Same id twice in a row: finish this page, then stop.
                    $stop_flag = 1;
                }
                else {
                    $lastLoopTweet_Id = $tweet_id;
                    $highest_tweet_Id = $tweet_id
                        if $tweet_id > $highest_tweet_Id;
                }

                $tweet_count++;
            }
            else {
                # Two search-phrase ids without a tweet id in between means
                # we are not getting tweets back any more.
                $search_phrase_flag++;
                $stop_flag = 1 if $search_phrase_flag >= 2;
            }
        }

        #--------------------------------------------------------
        #-- <published> lines: "<date>T<time>Z"
        #--------------------------------------------------------
        my $stamp = element_text($line, 'published', 'Z');
        if (defined $stamp) {
            ($mydate, $mytime) = split /T/, $stamp;
            $mydate = '' unless defined $mydate;
            $mytime = '' unless defined $mytime;
        }

        #--------------------------------------------------------
        #-- <name> lines
        #--------------------------------------------------------
        my $name_text = element_text($line, 'name');
        $name = $name_text if defined $name_text;

        #--------------------------------------------------------
        #-- <uri> lines: a CSV row is written each time one is seen,
        #-- using the id/date/time/name collected so far.
        #--------------------------------------------------------
        my $uri = element_text($line, 'uri');
        if (defined $uri) {
            print {$csv_fh}
                join(',', $tweet_id, $mydate, $mytime, $name, $uri) . "\n";
        }
    }
}

#--------------------------------------------------------
#--- Close data files
#--------------------------------------------------------
close $raw_fh or die "Cannot close $raw_file: $!";
close $csv_fh or die "Cannot close $csv_file: $!";

#--------------------------------------------------------
#--- Write out the highest tweet id to the file
#-- This is the tweet ID we want to stop at next time we run the script.
#--------------------------------------------------------
open my $state_out, '>', $state_file or die "Cannot write $state_file: $!";
print {$state_out} $highest_tweet_Id . "\n";
close $state_out or die "Cannot close $state_file: $!";

print "start tweet_id was " . $Prev_Tweet_Id
    . " - End Tweet ID was " . $highest_tweet_Id . "\n";

exit;