Source code for mgwr.sel_bw

# GWR Bandwidth selection class

#x_glob parameter does not yet do anything; it is for semiparametric


__author__ = "Taylor Oshan Tayoshan@gmail.com"

import spreg.user_output as USER
import numpy as np
from scipy.spatial.distance import pdist,squareform
from scipy.optimize import minimize_scalar
from spglm.family import Gaussian, Poisson, Binomial
from spglm.iwls import iwls,_compute_betas_gwr
from .kernels import *
from .gwr import GWR
from .search import golden_section, equal_interval, multi_bw
from .diagnostics import get_AICc, get_AIC, get_BIC, get_CV
from functools import partial

kernels = {1: fix_gauss, 2: adapt_gauss, 3: fix_bisquare, 4:
        adapt_bisquare, 5: fix_exp, 6:adapt_exp}
getDiag = {'AICc': get_AICc,'AIC':get_AIC, 'BIC': get_BIC, 'CV': get_CV}

[docs]class Sel_BW(object): """ Select bandwidth for kernel Methods: p211 - p213, bandwidth selection Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). Geographically weighted regression: the analysis of spatially varying relationships. Parameters ---------- y : array n*1, dependent variable. X_glob : array n*k1, fixed independent variable. X_loc : array n*k2, local independent variable, including constant. coords : list of tuples (x,y) of points used in bandwidth selection family : string GWR model type: 'Gaussian', 'logistic, 'Poisson'' offset : array n*1, the offset variable at the ith location. For Poisson model this term is often the size of the population at risk or the expected size of the outcome in spatial epidemiology Default is None where Ni becomes 1.0 for all locations kernel : string kernel function: 'gaussian', 'bisquare', 'exponetial' fixed : boolean True for fixed bandwidth and False for adaptive (NN) multi : True for multiple (covaraite-specific) bandwidths False for a traditional (same for all covariates) bandwdith; defualt is False. constant : boolean True to include intercept (default) in model and False to exclude intercept. spherical : boolean True for shperical coordinates (long-lat), False for projected coordinates (defalut). Attributes ---------- y : array n*1, dependent variable. X_glob : array n*k1, fixed independent variable. X_loc : array n*k2, local independent variable, including constant. coords : list of tuples (x,y) of points used in bandwidth selection family : string GWR model type: 'Gaussian', 'logistic, 'Poisson'' kernel : string type of kernel used and wether fixed or adaptive fixed : boolean True for fixed bandwidth and False for adaptive (NN) criterion : string bw selection criterion: 'AICc', 'AIC', 'BIC', 'CV' search_method : string bw search method: 'golden', 'interval' bw_min : float min value used in bandwidth search bw_max : float max value used in bandwidth search interval : float interval increment used in interval search tol : float tolerance used to determine convergence max_iter : integer max interations if no convergence to tol multi : True for multiple (covaraite-specific) bandwidths False for a traditional (same for all covariates) bandwdith; defualt is False. constant : boolean True to include intercept (default) in model and False to exclude intercept. offset : array n*1, the offset variable at the ith location. For Poisson model this term is often the size of the population at risk or the expected size of the outcome in spatial epidemiology Default is None where Ni becomes 1.0 for all locations dmat : array n*n, distance matrix between calibration locations used to compute weight matrix sorted_dmat : array n*n, sorted distance matrix between calibration locations used to compute weight matrix. Will be None for fixed bandwidths spherical : boolean True for shperical coordinates (long-lat), False for projected coordinates (defalut). search_params : dict stores search arguments int_score : boolan True if adaptive bandwidth is being used and bandwdith selection should be discrete. False if fixed bandwidth is being used and bandwidth does not have to be discrete. bw : scalar or array-like Derived optimal bandwidth(s). Will be a scalar for GWR (multi=False) and a list of scalars for MGWR (multi=True) with one bandwidth for each covariate. S : array n*n, hat matrix derived from the iterative backfitting algorthim for MGWR during bandwidth selection R : array n*n*k, partial hat matrices derived from the iterative backfitting algoruthm for MGWR during bandwidth selection. There is one n*n matrix for each of the k covariates. params : array n*k, calibrated parameter estimates for MGWR based on the iterative backfitting algorithm - computed and saved here to avoid having to do it again in the MGWR object. Examples -------- >>> import libpysal as ps >>> from mgwr.sel_bw import Sel_BW >>> data = ps.io.open(ps.examples.get_path('GData_utm.csv')) >>> coords = list(zip(data.by_col('X'), data.by_col('Y'))) >>> y = np.array(data.by_col('PctBach')).reshape((-1,1)) >>> rural = np.array(data.by_col('PctRural')).reshape((-1,1)) >>> pov = np.array(data.by_col('PctPov')).reshape((-1,1)) >>> african_amer = np.array(data.by_col('PctBlack')).reshape((-1,1)) >>> X = np.hstack([rural, pov, african_amer]) Golden section search AICc - adaptive bisquare >>> bw = Sel_BW(coords, y, X).search(criterion='AICc') >>> print(bw) 93.0 Golden section search AIC - adaptive Gaussian >>> bw = Sel_BW(coords, y, X, kernel='gaussian').search(criterion='AIC') >>> print(bw) 50.0 Golden section search BIC - adaptive Gaussian >>> bw = Sel_BW(coords, y, X, kernel='gaussian').search(criterion='BIC') >>> print(bw) 62.0 Golden section search CV - adaptive Gaussian >>> bw = Sel_BW(coords, y, X, kernel='gaussian').search(criterion='CV') >>> print(bw) 68.0 Interval AICc - fixed bisquare >>> sel = Sel_BW(coords, y, X, fixed=True) >>> bw = sel.search(search_method='interval', bw_min=211001.0, bw_max=211035.0, interval=2) >>> print(bw) 211025.0 """
[docs] def __init__(self, coords, y, X_loc, X_glob=None, family=Gaussian(), offset=None, kernel='bisquare', fixed=False, multi=False, constant=True, spherical=False): self.coords = coords self.y = y self.X_loc = X_loc if X_glob is not None: self.X_glob = X_glob else: self.X_glob = [] self.family=family self.fixed = fixed self.kernel = kernel if offset is None: self.offset = np.ones((len(y), 1)) else: self.offset = offset * 1.0 self.multi = multi self._functions = [] self.constant = constant self.spherical = spherical self._build_dMat() self.search_params = {}
[docs] def search(self, search_method='golden_section', criterion='AICc', bw_min=None, bw_max=None, interval=0.0, tol=1.0e-6, max_iter=200, init_multi=None, tol_multi=1.0e-5, rss_score=False, max_iter_multi=200, multi_bw_min=[None], multi_bw_max=[None]): """ Method to select one unique bandwidth for a gwr model or a bandwidth vector for a mgwr model. Parameters ---------- criterion : string bw selection criterion: 'AICc', 'AIC', 'BIC', 'CV' search_method : string bw search method: 'golden', 'interval' bw_min : float min value used in bandwidth search bw_max : float max value used in bandwidth search multi_bw_min : list min values used for each covariate in mgwr bandwidth search. Must be either a single value or have one value for each covariate including the intercept multi_bw_max : list max values used for each covariate in mgwr bandwidth search. Must be either a single value or have one value for each covariate including the intercept interval : float interval increment used in interval search tol : float tolerance used to determine convergence max_iter : integer max iterations if no convergence to tol init_multi : float None (default) to initialize MGWR with a bandwidth derived from GWR. Otherwise this option will choose the bandwidth to initialize MGWR with. tol_multi : convergence tolerence for the multiple bandwidth backfitting algorithm; a larger tolerance may stop the algorith faster though it may result in a less optimal model max_iter_multi : max iterations if no convergence to tol for multiple bandwidth backfittign algorithm rss_score : True to use the residual sum of sqaures to evaluate each iteration of the multiple bandwidth backfitting routine and False to use a smooth function; default is False Returns ------- bw : scalar or array optimal bandwidth value or values; returns scalar for multi=False and array for multi=True; ordering of bandwidths matches the ordering of the covariates (columns) of the designs matrix, X """ k = self.X_loc.shape[1] if self.constant: #k is the number of covariates k +=1 self.search_method = search_method self.criterion = criterion self.bw_min = bw_min self.bw_max = bw_max if len(multi_bw_min) == k: self.multi_bw_min = multi_bw_min elif len(multi_bw_min) == 1: self.multi_bw_min = multi_bw_min*k else: raise AttributeError("multi_bw_min must be either a list containing" " a single entry or a list containing an entry for each of k" " covariates including the intercept") if len(multi_bw_max) == k: self.multi_bw_max = multi_bw_max elif len(multi_bw_max) == 1: self.multi_bw_max = multi_bw_max*k else: raise AttributeError("multi_bw_max must be either a list containing" " a single entry or a list containing an entry for each of k" " covariates including the intercept") self.interval = interval self.tol = tol self.max_iter = max_iter self.init_multi = init_multi self.tol_multi = tol_multi self.rss_score = rss_score self.max_iter_multi = max_iter_multi self.search_params['search_method'] = search_method self.search_params['criterion'] = criterion self.search_params['bw_min'] = bw_min self.search_params['bw_max'] = bw_max self.search_params['interval'] = interval self.search_params['tol'] = tol self.search_params['max_iter'] = max_iter #self._check_min_max() if self.fixed: if self.kernel == 'gaussian': ktype = 1 elif self.kernel == 'bisquare': ktype = 3 elif self.kernel == 'exponential': ktype = 5 else: raise TypeError('Unsupported kernel function ', self.kernel) else: if self.kernel == 'gaussian': ktype = 2 elif self.kernel == 'bisquare': ktype = 4 elif self.kernel == 'exponential': ktype = 6 else: raise TypeError('Unsupported kernel function ', self.kernel) if ktype % 2 == 0: int_score = True else: int_score = False self.int_score = int_score #isn't this just self.fixed? if self.multi: self._mbw() self.params = self.bw[3] #params self.S = self.bw[-2] #(n,n) self.R = self.bw[-1] #(n,n,k) else: self._bw() return self.bw[0]
def _build_dMat(self): if self.fixed: self.dmat = cdist(self.coords,self.coords,self.spherical) self.sorted_dmat = None else: self.dmat = cdist(self.coords,self.coords,self.spherical) self.sorted_dmat = np.sort(self.dmat) def _bw(self): gwr_func = lambda bw: getDiag[self.criterion](GWR(self.coords, self.y, self.X_loc, bw, family=self.family, kernel=self.kernel, fixed=self.fixed, constant=self.constant, dmat=self.dmat,sorted_dmat=self.sorted_dmat).fit(searching = True)) self._optimized_function = gwr_func if self.search_method == 'golden_section': a,c = self._init_section(self.X_glob, self.X_loc, self.coords, self.constant) delta = 0.38197 #1 - (np.sqrt(5.0)-1.0)/2.0 self.bw = golden_section(a, c, delta, gwr_func, self.tol, self.max_iter, self.int_score) elif self.search_method == 'interval': self.bw = equal_interval(self.bw_min, self.bw_max, self.interval, gwr_func, self.int_score) elif self.search_method == 'scipy': self.bw_min, self.bw_max = self._init_section(self.X_glob, self.X_loc, self.coords, self.constant) if self.bw_min == self.bw_max: raise Exception('Maximum bandwidth and minimum bandwidth must be distinct for scipy optimizer.') self._optimize_result = minimize_scalar(gwr_func, bounds=(self.bw_min, self.bw_max), method='bounded') self.bw = [self._optimize_result.x, self._optimize_result.fun, []] else: raise TypeError('Unsupported computational search method ', self.search_method) def _mbw(self): y = self.y if self.constant: X = USER.check_constant(self.X_loc) else: X = self.X_loc n, k = X.shape family = self.family offset = self.offset kernel = self.kernel fixed = self.fixed coords = self.coords search_method = self.search_method criterion = self.criterion bw_min = self.bw_min bw_max = self.bw_max multi_bw_min = self.multi_bw_min multi_bw_max = self.multi_bw_max interval = self.interval tol = self.tol max_iter = self.max_iter def gwr_func(y,X,bw): return GWR(coords, y,X,bw,family=family, kernel=kernel, fixed=fixed, offset=offset, constant=False).fit() def bw_func(y,X): return Sel_BW(coords, y,X,X_glob=[], family=family, kernel=kernel, fixed=fixed, offset=offset, constant=False) def sel_func(bw_func, bw_min=None, bw_max=None): return bw_func.search(search_method=search_method, criterion=criterion, bw_min=bw_min, bw_max=bw_max, interval=interval, tol=tol, max_iter=max_iter) self.bw = multi_bw(self.init_multi, y, X, n, k, family, self.tol_multi, self.max_iter_multi, self.rss_score, gwr_func, bw_func, sel_func, multi_bw_min, multi_bw_max) def _init_section(self, X_glob, X_loc, coords, constant): if len(X_glob) > 0: n_glob = X_glob.shape[1] else: n_glob = 0 if len(X_loc) > 0: n_loc = X_loc.shape[1] else: n_loc = 0 if constant: n_vars = n_glob + n_loc + 1 else: n_vars = n_glob + n_loc n = np.array(coords).shape[0] if self.int_score: a = 40 + 2 * n_vars c = n else: sq_dists = pdist(coords) a = np.min(sq_dists)/2.0 c = np.max(sq_dists)*2.0 if self.bw_min is not None: a = self.bw_min if self.bw_max is not None: c = self.bw_max return a, c