# Source code for recordlinkage.api

from recordlinkage.base import BaseCompare
from recordlinkage.base import BaseIndex
from recordlinkage.compare import Date
from recordlinkage.compare import Exact
from recordlinkage.compare import Geographic
from recordlinkage.compare import Numeric
from recordlinkage.compare import String
from recordlinkage.index import Block
from recordlinkage.index import Full
from recordlinkage.index import Random
from recordlinkage.index import SortedNeighbourhood


class Index(BaseIndex):
    """Build an index of candidate record pairs.

    The convenience methods on this class each register one indexing
    algorithm through ``add`` and hand back the indexer itself, so the
    calls can be chained.

    Parameters
    ----------
    algorithms: list
        A list of index algorithm classes. The classes are based on
        :class:`recordlinkage.base.BaseIndexAlgorithm`

    Example
    -------
    Consider two historical datasets with census data to link. The
    datasets are named ``census_data_1980`` and ``census_data_1990``::

        indexer = recordlinkage.Index()
        indexer.block(left_on='first_name', right_on='givenname')
        indexer.index(census_data_1980, census_data_1990)

    """

    def full(self):
        """Register a 'full' index algorithm.

        Equivalent to adding a :class:`recordlinkage.index.Full`
        instance by hand::

            from recordlinkage.index import Full

            indexer = recordlinkage.Index()
            indexer.add(Full())

        Returns
        -------
        Index
            This indexer, to allow method chaining.

        """
        self.add(Full())
        return self

    def block(self, *args, **kwargs):
        """Register a block index algorithm.

        Equivalent to adding a :class:`recordlinkage.index.Block`
        instance by hand::

            from recordlinkage.index import Block

            indexer = recordlinkage.Index()
            indexer.add(Block())

        Parameters
        ----------
        *args, **kwargs
            Passed unchanged to the ``Block`` constructor.

        Returns
        -------
        Index
            This indexer, to allow method chaining.

        """
        self.add(Block(*args, **kwargs))
        return self

    def sortedneighbourhood(self, *args, **kwargs):
        """Register a Sorted Neighbourhood index algorithm.

        Equivalent to adding a
        :class:`recordlinkage.index.SortedNeighbourhood` instance by
        hand::

            from recordlinkage.index import SortedNeighbourhood

            indexer = recordlinkage.Index()
            indexer.add(SortedNeighbourhood())

        Parameters
        ----------
        *args, **kwargs
            Passed unchanged to the ``SortedNeighbourhood`` constructor.

        Returns
        -------
        Index
            This indexer, to allow method chaining.

        """
        self.add(SortedNeighbourhood(*args, **kwargs))
        return self

    def random(self, *args, **kwargs):
        """Register a random index algorithm.

        Equivalent to adding a :class:`recordlinkage.index.Random`
        instance by hand::

            from recordlinkage.index import Random

            indexer = recordlinkage.Index()
            indexer.add(Random())

        Parameters
        ----------
        *args, **kwargs
            Passed unchanged to the ``Random`` constructor.

        Returns
        -------
        Index
            This indexer, to allow method chaining.

        """
        self.add(Random(*args, **kwargs))
        return self


class Compare(BaseCompare):
    """Class to compare record pairs efficiently.

    Class to compare the attributes of candidate record pairs. The
    ``Compare`` class has methods like ``string``, ``exact`` and
    ``numeric`` to initialise the comparing of the records. The
    ``compute`` method is used to start the actual comparing.

    Example
    -------
    Consider two historical datasets with census data to link. The
    datasets are named ``census_data_1980`` and ``census_data_1990``.
    The MultiIndex ``candidate_pairs`` contains the record pairs to
    compare. The record pairs are compared on the first name, last name,
    sex, date of birth, address, place, and income::

        # initialise class
        comp = recordlinkage.Compare()

        # initialise similarity measurement algorithms
        comp.string('first_name', 'name', method='jarowinkler')
        comp.string('lastname', 'lastname', method='jarowinkler')
        comp.exact('dateofbirth', 'dob')
        comp.exact('sex', 'sex')
        comp.string('address', 'address', method='levenshtein')
        comp.exact('place', 'place')
        comp.numeric('income', 'income')

        # the method .compute() returns the DataFrame with the feature
        # vectors.
        comp.compute(candidate_pairs, census_data_1980, census_data_1990)

    Parameters
    ----------
    features : list
        List of compare algorithms.
    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for comparing of record
        pairs. If -1, then the number of jobs is set to the number of
        cores.
    indexing_type : string, optional (default='label')
        The indexing type. The MultiIndex is used to index the
        DataFrame(s). This can be done with pandas ``.loc`` or with
        ``.iloc``. Use the value 'label' to make use of ``.loc`` and
        'position' to make use of ``.iloc``. The value 'position' is
        only available when the MultiIndex consists of integers. The
        value 'position' is much faster.

    Attributes
    ----------
    features: list
        A list of algorithms to create features.

    """

    def exact(self, *args, **kwargs):
        """Compare attributes of pairs exactly.

        Shortcut of :class:`recordlinkage.compare.Exact`::

            from recordlinkage.compare import Exact

            comp = recordlinkage.Compare()
            comp.add(Exact())

        Parameters
        ----------
        *args, **kwargs
            Passed unchanged to the ``Exact`` constructor.

        Returns
        -------
        Compare
            This comparer, to allow method chaining.

        """
        compare = Exact(*args, **kwargs)
        self.add(compare)

        return self

    def string(self, *args, **kwargs):
        """Compare attributes of pairs with string algorithm.

        Shortcut of :class:`recordlinkage.compare.String`::

            from recordlinkage.compare import String

            comp = recordlinkage.Compare()
            comp.add(String())

        Parameters
        ----------
        *args, **kwargs
            Passed unchanged to the ``String`` constructor.

        Returns
        -------
        Compare
            This comparer, to allow method chaining.

        """
        compare = String(*args, **kwargs)
        self.add(compare)

        return self

    def numeric(self, *args, **kwargs):
        """Compare attributes of pairs with numeric algorithm.

        Shortcut of :class:`recordlinkage.compare.Numeric`::

            from recordlinkage.compare import Numeric

            comp = recordlinkage.Compare()
            comp.add(Numeric())

        Parameters
        ----------
        *args, **kwargs
            Passed unchanged to the ``Numeric`` constructor.

        Returns
        -------
        Compare
            This comparer, to allow method chaining.

        """
        compare = Numeric(*args, **kwargs)
        self.add(compare)

        return self

    def geo(self, *args, **kwargs):
        """Compare attributes of pairs with geo algorithm.

        Shortcut of :class:`recordlinkage.compare.Geographic`::

            from recordlinkage.compare import Geographic

            comp = recordlinkage.Compare()
            comp.add(Geographic())

        Parameters
        ----------
        *args, **kwargs
            Passed unchanged to the ``Geographic`` constructor.

        Returns
        -------
        Compare
            This comparer, to allow method chaining.

        """
        compare = Geographic(*args, **kwargs)
        self.add(compare)

        return self

    def date(self, *args, **kwargs):
        """Compare attributes of pairs with date algorithm.

        Shortcut of :class:`recordlinkage.compare.Date`::

            from recordlinkage.compare import Date

            comp = recordlinkage.Compare()
            comp.add(Date())

        Parameters
        ----------
        *args, **kwargs
            Passed unchanged to the ``Date`` constructor.

        Returns
        -------
        Compare
            This comparer, to allow method chaining.

        """
        compare = Date(*args, **kwargs)
        self.add(compare)

        return self