About

Data quality processes often need an external database to validate the data. This is frequently the case with address cleansing. And what better source than all the data that you can find on the web?

I live in the Netherlands, so I will show you how you can access the website http://www.goudengids.nl to verify your address data.

To accomplish this difficult task, we are going to use the Java API HttpClient to simulate browser navigation and scrape (get) the content.

I have written about this good API in this article, and below is the code to retrieve the web page.

Naturally, you are only allowed to do this with the agreement of the website.

Snippet

/**
 * 
 */
package com.gerardnico.dataquality;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.cookie.CookieSpec;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * Command-line example that posts an advanced search to the search form of
 * {@code www.goudengids.nl} with Commons HttpClient and saves the returned
 * HTML page to a file, echoing every line to standard output.
 *
 * <p>Usage: {@code YellowPage [phoneNumber [outputFile]]}. Both arguments are
 * optional; when omitted, the built-in defaults are used.
 */
public class YellowPage
{
    static final String LOGON_SITE = "www.goudengids.nl";
    static final int    LOGON_PORT = 80;

    /** Phone number searched for when none is given on the command line. */
    private static final String DEFAULT_PHONE_NUMBER = "030 26 28 661";

    /** File that receives the result page when none is given on the command line. */
    private static final String DEFAULT_OUTPUT_FILE = "C:/GoudenGids.htm";

    public YellowPage() {
        super();
    }

    /**
     * Fetches the search form, posts a search by phone number and stores the
     * result page.
     *
     * <p>The process exits with status 1 when the POST does not answer with
     * HTTP 200. The connection is released before exiting.
     *
     * @param args optional: {@code args[0]} is the phone number to look up,
     *             {@code args[1]} is the path of the output file
     * @throws Exception if an HTTP request fails or the result page cannot be
     *                   read or written
     */
    public static void main(String[] args) throws Exception {
        String phoneNumber = (args != null && args.length > 0) ? args[0] : DEFAULT_PHONE_NUMBER;
        String outputFile  = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_FILE;

        HttpClient client = new HttpClient();
        client.getHostConfiguration().setHost(LOGON_SITE, LOGON_PORT, "http");
        // Use the browser-compatibility cookie policy instead of the library default.
        client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

        // First request: plain GET of the search form, so that any cookies the
        // site sets are stored in the client state before the POST.
        GetMethod authget = new GetMethod("/search/");
        try {
            client.executeMethod(authget);
            System.out.println("Search form get: " + authget.getStatusLine().toString());
        } finally {
            // Release the connection even when the request fails.
            authget.releaseConnection();
        }

        // See if we got any cookies
        CookieSpec cookiespec = CookiePolicy.getDefaultSpec();
        printCookies("Initial set of cookies:", matchSiteCookies(cookiespec, client));

        // Second request: POST the advanced-search form. Only the phone number
        // carries a value; the other search fields are sent empty.
        // The "action" parameter ("/search.ds") is not sent.
        PostMethod getDataByPost = new PostMethod("/search/");
        getDataByPost.setRequestBody(new NameValuePair[] {
            new NameValuePair("newSearch", "true"),
            new NameValuePair("locale", "nl_NL"),
            new NameValuePair("advancedSearch", "true"),
            new NameValuePair("what", ""),
            new NameValuePair("where", ""),
            new NameValuePair("advancedSearchWho", ""),
            new NameValuePair("advancedSearchPostalCode", ""),
            new NameValuePair("advancedSearchPhoneNumber", phoneNumber),
            new NameValuePair("advancedSearchStreet", ""),
            new NameValuePair("advancedSearchHouseNumber", "")
        });

        int statuscode;
        try {
            client.executeMethod(getDataByPost);
            // The labels below say "Login"/"Logon" although this request is a
            // search post; the printed text is kept unchanged.
            System.out.println("Login form post: " + getDataByPost.getStatusLine().toString());
            printCookies("Logon cookies:", matchSiteCookies(cookiespec, client));

            statuscode = getDataByPost.getStatusCode();
            if (statuscode == HttpStatus.SC_OK) {
                // The saved page can then be parsed to extract the address data
                // (street, number, ...) and the geocoding data (latitude, longitude).
                saveResponse(getDataByPost, outputFile);
            } else {
                System.out.println("Invalid Code Status:" + statuscode  );
            }
        } finally {
            // release any connection resources used by the method
            getDataByPost.releaseConnection();
        }

        // Exit only after the finally block has run: System.exit() inside the
        // try block would skip the connection release.
        if (statuscode != HttpStatus.SC_OK) {
            System.exit(1);
        }
    }

    /** Returns the cookies in the client state that match the site root path. */
    private static Cookie[] matchSiteCookies(CookieSpec cookiespec, HttpClient client) {
        return cookiespec.match(
            LOGON_SITE, LOGON_PORT, "/", false, client.getState().getCookies());
    }

    /** Prints the title followed by one line per cookie, or "None" when there are none. */
    private static void printCookies(String title, Cookie[] cookies) {
        System.out.println(title);
        if (cookies.length == 0) {
            System.out.println("None");
            return;
        }
        for (int i = 0; i < cookies.length; i++) {
            System.out.println("- " + cookies[i].toString());
        }
    }

    /**
     * Copies the response body line by line to standard output and to the
     * given file, always closing the reader and the file.
     *
     * @param method     the executed method whose response body is read
     * @param outputFile path of the file to write
     * @throws IOException if the response has no body, or reading/writing fails
     */
    private static void saveResponse(PostMethod method, String outputFile) throws IOException {
        InputStream body = method.getResponseBodyAsStream();
        if (body == null) {
            // Fail clearly, and before an empty output file is created.
            throw new IOException("The response has no body to save to " + outputFile);
        }
        // Decode and re-encode with the charset announced by the response so
        // that non-ASCII characters do not depend on the platform default.
        String charset = method.getResponseCharSet();

        OutputStream out = new FileOutputStream(outputFile);
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(body, charset));
            try {
                String readLine;
                while ((readLine = br.readLine()) != null) {
                    System.out.println(" 1 - " + readLine);
                    out.write((readLine + "\n").getBytes(charset));
                }
            } finally {
                br.close();
            }
        } finally {
            out.close();
        }
    }

}

In the received web page, you can see and parse all the data:

mfinfo.show_name=NewFrontiers
mfinfo.show_location=Nieuwegein%7CUTRECHT
mfinfo.show_heading=Adviesbureaus+automatisering
mfinfo.show_postalcode=3439NS
defaultLat:52.356719
defaultLng:4.901275
detailsUrl":"http://goudengids.nl/bedrijven/Adviesbureaus-automatisering/NewFrontiers/NL_204325996_1000_1.html
"geoCoded":true,"streetAddress":"Grote Wade 38","latitude":52.0467167,"name":"NewFrontiers","city":"Nieuwegein"