# Source code for qwikidata.datavalue

# Copyright 2019 Kensho Technologies, LLC.
"""Module for Wikidata Datavalues."""

import re
from typing import Dict, Union

from qwikidata import typedefs


def _validate_datavalue_dict(datavalue_dict: typedefs.DatavalueDict) -> None:
    """Check that a datavalue dictionary has the expected keys and a known type.

    Parameters
    ----------
    datavalue_dict: dict
        mapping that must contain the keys `type` and `value`

    Raises
    ------
    ValueError
        if `type` or `value` is missing, or if the `type` entry is not one of
        the recognised datavalue types
    """
    required = ["type", "value"]
    if any(key not in datavalue_dict for key in required):
        raise ValueError(
            "required datavalue_dict keys are {} but only found {}".format(
                required, list(datavalue_dict.keys())
            )
        )

    known_types = frozenset(
        [
            "globecoordinate",
            "monolingualtext",
            "quantity",
            "string",
            "time",
            "wikibase-entityid",
            "wikibase-unmapped-entityid",
        ]
    )
    found_type = datavalue_dict["type"]
    if found_type in known_types:
        return

    raise ValueError(
        "datavalue datatype={} not in valid datatypes {}.".format(found_type, known_types)
    )


class GlobeCoordinate:
    """Class for `globecoordinate` datavalues.

    In this class, the `value` attribute is a mapping with the following keys,

    * **latitude** (`float` or `str`): the latitude part of the coordinate in degrees
    * **longitude** (`float` or `str`): the longitude part of the coordinate in degrees
    * **precision** (`float` or `str`): the coordinate's precision, in (fractions of) degrees
    * **globe** (`str`): the URI of a reference globe. This would typically refer to a
      data item on wikidata.org. This is usually just an indication of the celestial
      body (e.g. Q2 = earth), but could be more specific, like WGS 84 or ED50.
    * **altitude** (`float` or `str`): Deprecated and no longer used. Will be dropped
      in the future.

    Parameters
    ----------
    datavalue_dict: dict
        datavalue dictionary; it is checked with `_validate_datavalue_dict`, so it
        must contain the keys `type` and `value`

    Raises
    ------
    ValueError
        if `_validate_datavalue_dict` rejects `datavalue_dict`

    Attributes
    ----------
    value: dict
        mapping that represents a location on a globe
    datatype: str
        copied from `datavalue_dict["type"]` (expected to be `globecoordinate`)
    """

    def __init__(self, datavalue_dict: typedefs.GlobeCoordinateDatavalueDict) -> None:
        _validate_datavalue_dict(datavalue_dict)
        # Keep the raw dictionary alongside the two unpacked fields.
        self._datavalue_dict = datavalue_dict
        self.datatype = datavalue_dict["type"]
        self.value = datavalue_dict["value"]

    def __str__(self) -> str:
        """Return a short summary showing only latitude and longitude."""
        return "GlobeCoordinate(latitude={}, longitude={})".format(
            self.value["latitude"], self.value["longitude"]
        )


class MonolingualText:
    """Class for `monolingualtext` datavalues.

    In this class, the `value` attribute is a mapping with the following keys,

    * **text** (`str`): a string literal
    * **language** (`str`): a `Wikidata language code`_

    .. _Wikidata language code:
       https://www.wikidata.org/wiki/Help:Wikimedia_language_codes/lists/all

    Parameters
    ----------
    datavalue_dict: dict
        datavalue dictionary; it is checked with `_validate_datavalue_dict`, so it
        must contain the keys `type` and `value`

    Raises
    ------
    ValueError
        if `_validate_datavalue_dict` rejects `datavalue_dict`

    Attributes
    ----------
    value: dict
        mapping that represents text in a specific language
    datatype: str
        copied from `datavalue_dict["type"]` (expected to be `monolingualtext`)
    """

    def __init__(self, datavalue_dict: typedefs.MonolingualTextDatavalueDict) -> None:
        _validate_datavalue_dict(datavalue_dict)
        # Keep the raw dictionary alongside the two unpacked fields.
        self._datavalue_dict = datavalue_dict
        self.datatype = datavalue_dict["type"]
        self.value = datavalue_dict["value"]

    def __str__(self) -> str:
        """Return a short summary showing the text and its language code."""
        # The label previously read "MongolingualText"; it now matches the class name.
        return "MonolingualText(text={}, language={})".format(
            self.value["text"], self.value["language"]
        )


class Quantity:
    """Class for `quantity` datavalues.

    In this class, the `value` attribute is a mapping with the following keys,

    * **amount** (`str`): The nominal value of the quantity, as an arbitrary precision
      decimal string. The string always starts with a character indicating the sign
      of the value, either "+" or "-".
    * **upperBound** (`str`): Optionally, the upper bound of the quantity's uncertainty
      interval, using the same notation as the amount field. If not given or null,
      the uncertainty (or precision) of the quantity is not known. If the upperBound
      field is given, the lowerBound field must also be given.
    * **lowerBound** (`str`): Optionally, the lower bound of the quantity's uncertainty
      interval, using the same notation as the amount field. If not given or null,
      the uncertainty (or precision) of the quantity is not known. If the lowerBound
      field is given, the upperBound field must also be given.
    * **unit** (`str`): the URI of a unit (or "1" to indicate a unit-less quantity).
      This would typically refer to a data item on wikidata.org, e.g.
      http://www.wikidata.org/entity/Q712226 for "square kilometer".

    Parameters
    ----------
    datavalue_dict: dict
        datavalue dictionary; it is checked with `_validate_datavalue_dict`, so it
        must contain the keys `type` and `value`

    Raises
    ------
    ValueError
        if `_validate_datavalue_dict` rejects `datavalue_dict`

    Attributes
    ----------
    value: dict
        mapping that represents a numeric quantity
    datatype: str
        copied from `datavalue_dict["type"]` (expected to be `quantity`)
    """

    def __init__(self, datavalue_dict: typedefs.QuantityDatavalueDict) -> None:
        _validate_datavalue_dict(datavalue_dict)
        # Keep the raw dictionary alongside the two unpacked fields.
        self._datavalue_dict = datavalue_dict
        self.datatype = datavalue_dict["type"]
        self.value = datavalue_dict["value"]

    def __str__(self) -> str:
        """Return a short summary showing only the amount and the unit."""
        return "Quantity(amount={}, unit={})".format(self.value["amount"], self.value["unit"])


class String:
    """Class for `string` datavalues.

    Parameters
    ----------
    datavalue_dict: dict
        datavalue dictionary; it is checked with `_validate_datavalue_dict`, so it
        must contain the keys `type` and `value`

    Raises
    ------
    ValueError
        if `_validate_datavalue_dict` rejects `datavalue_dict`

    Attributes
    ----------
    value: str
        a string literal
    datatype: str
        copied from `datavalue_dict["type"]` (expected to be `string`)
    """

    def __init__(self, datavalue_dict: typedefs.StringDatavalueDict) -> None:
        _validate_datavalue_dict(datavalue_dict)
        # Keep the raw dictionary alongside the two unpacked fields.
        self._datavalue_dict = datavalue_dict
        self.datatype = datavalue_dict["type"]
        self.value = datavalue_dict["value"]

    def __str__(self) -> str:
        """Return a short summary wrapping the string value."""
        return "String(value={})".format(self.value)


class Time:
    """Class for `time` datavalues.

    In this class, the `value` attribute is a mapping with the following keys,

    * **time** (`str`): the format and interpretation of this string depends on the
      calendar model. Currently, only Julian and Gregorian dates are supported.
      The format used for Gregorian and Julian dates use a notation resembling
      ISO 8601. E.g. "+1994-01-01T00:00:00Z". The year is represented by at least
      four digits, zeros are added on the left side as needed. Years BCE are
      represented as negative numbers, using the historical numbering, in which
      year 0 is undefined, and the year 1 BCE is represented as -0001, the year
      44 BCE is represented as -0044, etc., like XSD 1.0 (ISO 8601:1988) does.
      Month and day may be 00 if they are unknown or insignificant. The day of the
      month may have values between 0 and 31 for any month, to accommodate
      "leap dates" like February 30. Hour, minute, and second are currently unused
      and should always be 00.
    * **timezone** (`int`): Signed integer. Currently unused, and should always be 0.
    * **calendarmodel** (`str`): URI of a calendar model, such as gregorian or julian.
      Typically given as the URI of a data item on the repository
    * **precision** (`int`): To what unit is the given date/time significant?
      Given as an integer indicating one of the following units:

        * 0: 1 Gigayear
        * 1: 100 Megayears
        * 2: 10 Megayears
        * 3: Megayear
        * 4: 100 Kiloyears
        * 5: 10 Kiloyears
        * 6: Kiloyear
        * 7: 100 years
        * 8: 10 years
        * 9: years
        * 10: months
        * 11: days
        * 12: hours (unused)
        * 13: minutes (unused)
        * 14: seconds (unused)

      Note that the precision should be read as an indicator of the significant
      parts of the date string, it does not directly specify an interval. That is,
      1988-07-13T00:00:00 with precision 8 (decade) will be interpreted as
      198?-??-?? and rendered as "1980s". 1981-01-21T00:00:00 with precision 8
      would have the exact same interpretation. Thus the two dates are equivalent,
      since year, month, and days are treated as insignificant.
    * **before** (`int`): Beginning of an uncertainty range, given in the unit defined
      by the precision field. This cannot be used to represent a duration.
      (Currently unused, may be dropped in the future)
    * **after** (`int`): End of an uncertainty range, given in the unit defined by the
      precision field. This cannot be used to represent a duration.
      (Currently unused, may be dropped in the future)

    Parameters
    ----------
    datavalue_dict: dict
        datavalue dictionary; it is checked with `_validate_datavalue_dict`, so it
        must contain the keys `type` and `value`

    Raises
    ------
    ValueError
        if `_validate_datavalue_dict` rejects `datavalue_dict`

    Attributes
    ----------
    value: dict
        mapping that represents a time
    datatype: str
        copied from `datavalue_dict["type"]` (expected to be `time`)
    STANDARD_DATE_REGEX: re.Pattern
        compiled pattern used by `get_parsed_datetime_dict` to split a time string
        into year, month, day, hour, minute and second
    """

    # The pattern is constant, so it is compiled once for the class instead of
    # once per instance; `self.STANDARD_DATE_REGEX` still resolves to it.
    STANDARD_DATE_REGEX = re.compile(
        r"""
        (?P<year>[+-]?\d+?)-
        (?P<month>\d\d)-
        (?P<day>\d\d)T
        (?P<hour>\d\d):
        (?P<minute>\d\d):
        (?P<second>\d\d)Z?""",
        re.VERBOSE,
    )

    def __init__(self, datavalue_dict: typedefs.TimeDatavalueDict) -> None:
        _validate_datavalue_dict(datavalue_dict)
        # Keep the raw dictionary alongside the two unpacked fields.
        self._datavalue_dict = datavalue_dict
        self.datatype = datavalue_dict["type"]
        self.value = datavalue_dict["value"]

    def __str__(self) -> str:
        """Return a short summary showing the raw time string and the precision."""
        return "Time(time={}, precision={})".format(self.value["time"], self.value["precision"])

    def get_parsed_datetime_dict(self) -> Dict[str, int]:
        """Return a dictionary representation of the datavalue.

        Given a Wikidata time string, extract the year, month, and day.
        Time strings look like this, for examples: '+1838-01-01T00:00:00Z'.
        See: https://www.wikidata.org/wiki/Help:Dates

        TODO: Handle truncated dates (like 20 for 20th century)
        TODO: Allow for partial match if we only know some information like the year

        Returns
        -------
        dict
            a dictionary with the integer fields `year`, `month`, `day`, `hour`,
            `minute` and `second`; empty when the time string does not fully
            match `STANDARD_DATE_REGEX`
        """
        match = self.STANDARD_DATE_REGEX.fullmatch(self.value["time"])
        if match is None:
            # Unparseable strings yield an empty dict rather than an exception.
            return {}
        return {
            "year": int(match.group("year")),
            "month": int(match.group("month")),
            "day": int(match.group("day")),
            "hour": int(match.group("hour")),
            "minute": int(match.group("minute")),
            "second": int(match.group("second")),
        }


class WikibaseEntityId:
    """Class for `wikibase-entityid` datavalues.

    In this class, the `value` attribute is a mapping with the following keys,

    * **entity-type** (`str`): one of ["item", "property"]
    * **id** (`str`): string form of entity id (e.g. "Q42")
    * **numeric-id** (`int`): integer form of entity id (e.g. 42)

    Parameters
    ----------
    datavalue_dict: dict
        datavalue dictionary; it is checked with `_validate_datavalue_dict`, so it
        must contain the keys `type` and `value`

    Raises
    ------
    ValueError
        if `_validate_datavalue_dict` rejects `datavalue_dict`

    Attributes
    ----------
    value: dict
        mapping that references a Wikidata entity
    datatype: str
        copied from `datavalue_dict["type"]` (expected to be `wikibase-entityid`)
    """

    def __init__(self, datavalue_dict: typedefs.WikibaseEntityIdDatavalueDict) -> None:
        _validate_datavalue_dict(datavalue_dict)
        # Keep the raw dictionary alongside the two unpacked fields.
        self._datavalue_dict = datavalue_dict
        self.datatype = datavalue_dict["type"]
        self.value = datavalue_dict["value"]

    def __str__(self) -> str:
        """Return a short summary showing only the string form of the entity id."""
        return "WikibaseEntityId(id={})".format(self.value["id"])


class WikibaseUnmappedEntityId:
    """Class for `wikibase-unmapped-entityid` datavalues.

    In this class, the `value` attribute is a string representing an unmapped
    wikibase entity id.

    Parameters
    ----------
    datavalue_dict: dict
        datavalue dictionary; it is checked with `_validate_datavalue_dict`, so it
        must contain the keys `type` and `value`

    Raises
    ------
    ValueError
        if `_validate_datavalue_dict` rejects `datavalue_dict`

    Attributes
    ----------
    value: str
        string that references an unmapped entity id
    datatype: str
        copied from `datavalue_dict["type"]` (expected to be
        `wikibase-unmapped-entityid`)
    """

    def __init__(self, datavalue_dict: typedefs.WikibaseUnmappedEntityIdDatavalueDict) -> None:
        _validate_datavalue_dict(datavalue_dict)
        # Keep the raw dictionary alongside the two unpacked fields.
        self._datavalue_dict = datavalue_dict
        self.datatype = datavalue_dict["type"]
        self.value = datavalue_dict["value"]

    def __str__(self) -> str:
        """Return a short summary wrapping the unmapped entity id string."""
        return "WikibaseUnmappedEntityId(value={})".format(self.value)


# Maps the `type` string of a datavalue dictionary to the class that wraps it;
# used by `get_datavalue_from_snak_dict` to pick the class to instantiate.
_DATAVALUE_TYPE_TO_CLASS = {
    "globecoordinate": GlobeCoordinate,
    "monolingualtext": MonolingualText,
    "quantity": Quantity,
    "string": String,
    "time": Time,
    "wikibase-entityid": WikibaseEntityId,
    "wikibase-unmapped-entityid": WikibaseUnmappedEntityId,
}

# Type alias covering every datavalue class defined in this module.
WikidataDatavalue = Union[
    GlobeCoordinate,
    MonolingualText,
    Quantity,
    String,
    Time,
    WikibaseEntityId,
    WikibaseUnmappedEntityId,
]


def get_datavalue_from_snak_dict(snak_dict: typedefs.SnakDict) -> Union[WikidataDatavalue, None]:
    """Return a Wikidata Datavalue from a snak dictionary.

    Parameters
    ----------
    snak_dict: dict
        snak dictionary with a `snaktype` key; when the snaktype is `value` it
        must also carry a `datavalue` dictionary

    Returns
    -------
    WikidataDatavalue or None
        an instance of the class registered in `_DATAVALUE_TYPE_TO_CLASS` for the
        datavalue's `type` when the snaktype is `value`; `None` when the snaktype
        is `somevalue` or `novalue`

    Raises
    ------
    ValueError
        if the snaktype is not one of `value`, `somevalue`, `novalue`
    KeyError
        if the datavalue's `type` has no entry in `_DATAVALUE_TYPE_TO_CLASS`
    """
    snaktype = snak_dict["snaktype"]

    if snaktype == "value":
        datavalue_dict = snak_dict["datavalue"]
        datavalue_class = _DATAVALUE_TYPE_TO_CLASS[str(datavalue_dict["type"])]
        return datavalue_class(datavalue_dict)

    if snaktype == "somevalue" or snaktype == "novalue":
        # These snaks carry no datavalue, so there is nothing to wrap.
        return None

    raise ValueError(
        'snaktype must be one of ["value", "somevalue", "novalue"] but got {}.'.format(snaktype)
    )