Tuesday, February 1, 2011

GeoCoding part 2 - let google do it

Some people have asked me how to get around Google's 2500 record limit when doing geocoding. Well, first off, I don't think they enforce it very strictly, but I think a way around the limit that may not be against their TOS (or is a gray area at most) would be to make only 2500 requests per IP. Now, since most of us run geocodes from servers that have multiple IPs, why not rotate those IPs when geocoding massive amounts of data? After all, the data is usually for more than one organization, right? Let multiple organizations claim it and each use their fair share of Google's geocoder; just remember to give Google credit, and make sure to show the results on a map somewhere on each of those organizations' sites to stay in compliance!

Here's a quick and dirty class for geocoding through an IP rotator... It uses the same IP until Google tells you that you can't make any more requests, then switches to a different IP. The PHP curl extension must be installed to use it.


// PHP 7+ ships a built-in \Error class, so an unconditional `class Error`
// declaration is a fatal "name already in use" error there. Only declare this
// logging helper when the name is still free (PHP 5).
if ( ! class_exists('Error', false) ) {
    /**
     * Minimal logging helper: forwards messages to the PHP error log.
     */
    class Error {
        /**
         * Write a message to the PHP error log, prefixed with "GEOCODER: ".
         *
         * Declared static so it can be invoked as Error::log_message()
         * without creating an instance.
         *
         * @param string $message Text to log.
         * @return void
         */
        public static function log_message ( $message ) { error_log("GEOCODER: $message"); }
    }
}

/**
 * Geocodes addresses through Google's JSON geocoding endpoint, sending
 * requests from one local source IP at a time and moving on to the next
 * configured IP when Google refuses further requests.
 *
 * Requires the PHP curl extension.
 */
class GoogleGeocoder {
    /** Local source addresses to send requests from, tried in order. */
    public $source_ips = array ( "127.0.0.1" );
    /** Index into $source_ips of the address in use; false once all are used up. */
    public $current_ip_index = NULL;
    /** Source address currently in use; NULL until change_ip() has run once. */
    public $current_ip = NULL;
    /** True once every entry of $source_ips has been used up. */
    public $ip_exhausted = false;

    /** Geocoding endpoint; query parameters are appended to it. */
    public $geo_url = "http://maps.google.com/maps/api/geocode/json";
    /** Query-string fragments sent with every request. */
    public $geo_get_params = array("sensor=false");

    /**
     * Select the first source IP.
     *
     * @param array|null $source_ips Optional replacement for the default
     *                               $source_ips list; ignored when not a
     *                               non-empty array.
     */
    public function __construct ( $source_ips = NULL ) {
        if ( is_array($source_ips) && count($source_ips) > 0 ) {
            // Re-index so that rotation can walk the list by integer offset.
            $this->source_ips = array_values($source_ips);
        }
        $this->change_ip();
    }

    /**
     * Advance to the next source IP.
     *
     * The first call selects $source_ips[0]; each later call moves one entry
     * forward. Once the list runs out, $ip_exhausted is set and every further
     * call fails.
     *
     * @return bool true when a source IP is selected, false when none is left.
     */
    public function change_ip ( ) {
        // If we already exhausted them from previous iterations, block...
        if ( $this->ip_exhausted === true ) {
            $this->log_message("All ips exhausted, please do not try any more geocodes");
            return false;
        }

        // If we have never set the IP, start at index 0
        if ( $this->current_ip === NULL ) {
            if ( ! isset($this->source_ips[0]) ) {
                // Nothing configured at all: there is no IP to rotate onto.
                $this->current_ip_index = false;
                $this->ip_exhausted = true;
                return false;
            }
            $this->current_ip = $this->source_ips[0];
            $this->current_ip_index = 0;
            return true;
        }

        // Otherwise rotate until we run past the last one
        $next_index = $this->current_ip_index + 1;
        if ( isset($this->source_ips[$next_index]) ) {
            $this->current_ip_index = $next_index;
            $this->current_ip = $this->source_ips[$next_index];
            return true;
        }

        $this->current_ip_index = false;
        $this->ip_exhausted = true;
        return false;
    }

    /**
     * Perform an HTTP request with curl from the current source IP.
     *
     * @param string      $url      URL to request.
     * @param mixed|null  $postdata When not NULL, sent as the POST body.
     * @param bool        $xml      When true, send "Content-Type: text/xml".
     * @return string|false Response body, or false when all IPs are exhausted
     *                      or curl fails.
     */
    public function send_request_via_curl($url,$postdata=NULL,$xml=true) {
        if ( $this->ip_exhausted ) {
            $this->log_message("All ips exhausted, please do not try any more geocodes");
            return false;
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        if ( $xml ) curl_setopt($ch, CURLOPT_HTTPHEADER, array("Content-Type: text/xml"));

        // Bind the outgoing connection to the selected source IP; without this
        // the rotation has no effect. A loopback address (the shipped default)
        // cannot reach a remote host, so in that case leave the choice of
        // interface to the operating system.
        if ( ! empty($this->current_ip) && ! $this->is_loopback($this->current_ip) ) {
            curl_setopt($ch, CURLOPT_INTERFACE, $this->current_ip);
        }

        if ( ! is_null($postdata) ) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $postdata);
        }

        // NOTE(review): peer verification is switched off. Harmless for the
        // default http:// endpoint, but unsafe if $geo_url is changed to https.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
        $response = curl_exec($ch);
        if ( false === $response ) {
            $this->log_message("curl error: " . curl_error($ch));
        }

        // Throttle: pause 0.1s after every request.
        usleep(100000);

        return $response;
    }

    /**
     * Geocode one address.
     *
     * @param string|array $address Free-text address, or an array of address
     *                              parts that are joined with ", ".
     * @param string       $country Optional region bias; "gb" is sent as "uk".
     * @return array|false Result array from parse_geocode_response(), or false
     *                     when the address is empty, the request fails, the
     *                     response is not valid JSON, or no source IP is left.
     */
    public function geocode ( $address, $country="" ) {
        if ( empty($address) ) { $this->log_message("Empty address"); return false; }
        if ( $this->ip_exhausted ) {
            $this->log_message("All ips exhausted, please do not try any more geocodes");
            return false;
        }

        if ( is_array($address) ) { $address = join(', ', $address); }

        // URL-encode only. HTML-escaping first would turn "&" into "&amp;"
        // and send a corrupted address.
        $get_params = $this->geo_get_params;
        $get_params[] = "address=" . urlencode($address);

        if ( ! empty($country) ) {
            if ( $country == "gb" ) { $country = "uk"; }
            $get_params[] = "region=" . urlencode($country);
        }

        $url = $this->geo_url . "?" . join('&', $get_params);

        // Retry the same URL after each successful IP switch. Looping, rather
        // than calling geocode() again, avoids re-encoding the address.
        while ( true ) {
            // A GET with no body needs no Content-Type header, hence $xml=false.
            $response = $this->send_request_via_curl($url, NULL, false);
            if ( false === $response ) {
                $this->log_message("curl $url returned false!");
                return false;
            }

            // json_decode() signals failure with NULL, not false.
            $result = json_decode($response, true);
            if ( ! is_array($result) || ! isset($result["status"]) ) {
                $this->log_message("could not json_decode() result");
                return false;
            }

            // Only refusals tied to the caller are worth a new IP. A malformed
            // request (INVALID_REQUEST) would be malformed from every IP, so it
            // is reported as a normal "not found" result below instead of
            // using up the whole list.
            if ( $result["status"] == "OVER_QUERY_LIMIT" || $result["status"] == "REQUEST_DENIED" ) {
                $this->log_message("Got response " . $result["status"] . ", switching IP's");
                if ( false === $this->change_ip() ) {
                    $this->log_message("change_ip returned false, so geocode() is exiting");
                    return false;
                }
                continue;
            }

            return $this->parse_geocode_response($result);
        }
    }

    /**
     * Flatten a decoded geocoder response into a single-level array.
     *
     * Always sets "geo_stat" ("200" when a result is present, "404"
     * otherwise). For a found address it also sets "geocoder_source",
     * "geo_lat", "geo_lon", "geo_resolution", whichever of "addr_number",
     * "addr_street_name", "addr_city", "addr_state", "addr_zip" appear in the
     * first result, and "addr_street" (the street name, or NULL).
     *
     * @param array $json Decoded JSON response.
     * @return array
     */
    public function parse_geocode_response ( $json ) {
        $result = array();
        $found = isset($json["status"]) && $json["status"] == "OK" && isset($json["results"][0]);
        $result["geo_stat"] = ( $found ? "200" : "404" );
        if ( ! $found ) {
            return $result;
        }

        // Only the first (best) match is used.
        $active_record = $json["results"][0];
        $geometry = isset($active_record["geometry"]) ? $active_record["geometry"] : array();

        $result["geocoder_source"] = "google";
        $result["geo_lat"] = isset($geometry["location"]["lat"]) ? $geometry["location"]["lat"] : NULL;
        $result["geo_lon"] = isset($geometry["location"]["lng"]) ? $geometry["location"]["lng"] : NULL;
        $result["geo_resolution"] = isset($geometry["location_type"]) ? $geometry["location_type"] : NULL;

        // Component type => array(result key, which name variant to copy).
        $component_map = array(
            'street_number'               => array("addr_number", "long_name"),
            'route'                       => array("addr_street_name", "long_name"),
            'locality'                    => array("addr_city", "long_name"),
            'administrative_area_level_1' => array("addr_state", "short_name"),
            'postal_code'                 => array("addr_zip", "long_name"),
        );

        $components = isset($active_record["address_components"]) ? $active_record["address_components"] : array();
        foreach ( $components as $info ) {
            // Components are classified by their first listed type only.
            if ( ! isset($info["types"][0]) || ! isset($component_map[$info["types"][0]]) ) {
                continue;
            }
            $target_key = $component_map[$info["types"][0]][0];
            $name_field = $component_map[$info["types"][0]][1];
            $result[$target_key] = isset($info[$name_field]) ? $info[$name_field] : NULL;
        }

        $result["addr_street"] = isset($result["addr_street_name"]) ? $result["addr_street_name"] : NULL;

        return $result;
    }

    /**
     * Write a message to the PHP error log, prefixed with "GEOCODER: ".
     */
    private function log_message ( $message ) {
        error_log("GEOCODER: $message");
    }

    /**
     * Tell whether an address refers to the local loopback interface.
     */
    private function is_loopback ( $ip ) {
        return $ip === 'localhost' || $ip === '::1' || strpos((string) $ip, '127.') === 0;
    }
}

/* Example Usage:
$record["text_address"]="1600 Pennsylvania Ave., Washington DC";

$geocoder=new GoogleGeocoder();
if ( false !== ($geo_info=$geocoder->geocode($record["text_address"])) ) {
$record=array_merge($record,$geo_info);
} else {
echo "Stop geocoding, google cut you off!\n";
}

print_r($record);

Array(
["text_address"] => "1600 Pennsylvania Ave., Washington DC",
["geocoder_source"] => "google",
["geo_lat"] => "x.xxx",
["geo_lon"] => "x.xxx",
["geo_stat"] => "200", // HTTP status codes, 404 means address not found
["geo_resolution"] => "ROOFTOP",
["addr_number"] => 1600,
["addr_street_name"] => "Pennsylvania Ave",
["addr_city"] => "Washington",
["addr_state"] => "DC", // I'm just guessing here, maybe MD?
["addr_zip"] => "12345"
);
*/
?>


3 comments:

  1. good stuff, thanks, and followed

    ReplyDelete
  2. Interesting!

    Following!

    blundersfrom6foot2.blogspot.com
    androidnews4u.blogspot.com

    ReplyDelete