Source code for pyUSPTO.models.bulk_data

"""models.bulk_data - Data models for USPTO bulk data API.

This module provides data models for the USPTO Open Data Portal (ODP) Bulk Data API.
"""

import json
from dataclasses import dataclass, field
from datetime import date, datetime
from enum import Enum
from typing import Any

from pyUSPTO.models.utils import (
    parse_to_date,
    parse_to_datetime_utc,
    serialize_date,
    serialize_datetime_as_naive,
)


# --- Enums for Categorical Data ---
class FileTypeCategory(Enum):
    """File type categories in bulk data.

    This enum provides type-safe representations of file types commonly
    found in USPTO bulk data products.

    Lookup by value is forgiving: case, surrounding whitespace and leading
    dots are ignored, and dots inside the value are treated as underscores,
    so ``FileTypeCategory(".tar.gz")`` resolves to ``TAR_GZ``.
    """

    ZIP = "ZIP"
    TAR = "TAR"
    TAR_GZ = "TAR_GZ"
    TGZ = "TGZ"
    XML = "XML"
    JSON = "JSON"
    CSV = "CSV"

    @classmethod
    def _missing_(cls, value: Any) -> "FileTypeCategory | None":
        """Resolve values that do not match a member value exactly.

        Args:
            value: The value passed to ``FileTypeCategory(...)`` that failed
                the exact lookup.

        Returns:
            The matching member, or None when the value is not a string or
            no member/alias matches (the Enum machinery then raises
            ``ValueError``).
        """
        if not isinstance(value, str):
            return None

        # Normalise "tar.gz", " .ZIP " and similar spellings to the form
        # used by the member values.
        normalized = value.strip().lstrip(".").upper().replace(".", "_")

        # Real members take precedence over aliases so the result does not
        # depend on the case of the input ("tgz" and "TGZ" both give TGZ).
        for member in cls:
            if member.value == normalized:
                return member

        # Only spelling that is not itself a member value.
        if normalized == "TARGZ":
            return cls.TAR_GZ
        return None
class ProductFrequency(Enum):
    """Product update frequency categories.

    Each member names how often a bulk data product is updated. Lookup by
    value ignores case and accepts spaces or hyphens in place of
    underscores (for example ``"ad hoc"`` or ``"Ad-Hoc"``).
    """

    DAILY = "DAILY"
    WEEKLY = "WEEKLY"
    MONTHLY = "MONTHLY"
    QUARTERLY = "QUARTERLY"
    ANNUALLY = "ANNUALLY"
    AD_HOC = "AD_HOC"

    @classmethod
    def _missing_(cls, value: Any) -> "ProductFrequency | None":
        """Resolve a value that did not match a member value exactly.

        Args:
            value: The value passed to ``ProductFrequency(...)``.

        Returns:
            The member whose value equals the normalised input, or None when
            the input is not a string or nothing matches.
        """
        if not isinstance(value, str):
            return None

        # Upper-case first, then unify the separators to underscores.
        wanted = value.upper().replace(" ", "_").replace("-", "_")
        return next(
            (candidate for candidate in cls if candidate.value.upper() == wanted),
            None,
        )
[docs] @dataclass(frozen=True) class FileData: """Represent a file in the bulk data API. Attributes: file_name: The name of the file. file_size: Size of the file in bytes. product_identifier: The identifier of the product this file belongs to. file_data_from_date: Start date of data covered in the file. file_data_to_date: End date of data covered in the file. file_type_text: Description of the file type. file_release_date: Date when the file was released. file_download_uri: URL for downloading the file. file_date: Additional file date information. file_last_modified_date_time: Last modification timestamp. raw_data: Optional raw JSON data from the API response (for debugging). """ file_name: str file_size: int product_identifier: str file_data_from_date: date | None file_data_to_date: date | None file_type_text: str file_release_date: date | None file_download_uri: str | None = None file_date: date | None = None file_last_modified_date_time: datetime | None = None raw_data: str | None = field(default=None, compare=False, repr=False)
[docs] @classmethod def from_dict( cls, data: dict[str, Any], product_identifier: str, include_raw_data: bool = False, ) -> "FileData": """Create a FileData object from a dictionary. Args: data: Dictionary containing file data from API response. product_identifier: The identifier of the product this file belongs to. include_raw_data: If True, store the raw JSON for debugging. Returns: FileData: An instance of FileData. """ return cls( file_name=data.get("fileName", ""), file_size=data.get("fileSize", 0), product_identifier=product_identifier, file_data_from_date=parse_to_date(data.get("fileDataFromDate")), file_data_to_date=parse_to_date(data.get("fileDataToDate")), file_type_text=data.get("fileTypeText", ""), file_release_date=parse_to_date(data.get("fileReleaseDate")), file_download_uri=data.get("fileDownloadURI"), file_date=parse_to_date(data.get("fileDate")), file_last_modified_date_time=parse_to_datetime_utc( data.get("fileLastModifiedDateTime") ), raw_data=json.dumps(data) if include_raw_data else None, )
[docs] def to_dict(self) -> dict[str, Any]: """Convert the FileData object to a dictionary. Returns: Dict[str, Any]: Dictionary representation with camelCase keys. """ d = { "fileName": self.file_name, "fileSize": self.file_size, "fileDataFromDate": serialize_date(self.file_data_from_date), "fileDataToDate": serialize_date(self.file_data_to_date), "fileTypeText": self.file_type_text, "fileReleaseDate": serialize_date(self.file_release_date), "fileDownloadURI": self.file_download_uri, "fileDate": serialize_date(self.file_date), "fileLastModifiedDateTime": ( serialize_datetime_as_naive(self.file_last_modified_date_time) if self.file_last_modified_date_time else None ), } return {k: v for k, v in d.items() if v is not None}
@dataclass(frozen=True)
class ProductFileBag:
    """Container for the files that belong to one product.

    Attributes:
        count: The number of files in the bag, as given by the API response.
        file_data_bag: List of FileData objects.
        raw_data: Optional raw JSON data from the API response (for
            debugging). Excluded from equality comparison and ``repr``.
    """

    count: int
    file_data_bag: list[FileData] = field(default_factory=list)
    raw_data: str | None = field(default=None, compare=False, repr=False)

    @classmethod
    def from_dict(
        cls,
        data: dict[str, Any],
        product_identifier: str,
        include_raw_data: bool = False,
    ) -> "ProductFileBag":
        """Build a ProductFileBag from an API response dictionary.

        Args:
            data: Dictionary containing product file bag data.
            product_identifier: The identifier of the product this bag
                belongs to; forwarded to every FileData.
            include_raw_data: If True, store the raw JSON for debugging
                (also forwarded to every FileData).

        Returns:
            ProductFileBag: An instance of ProductFileBag.
        """
        entries = data.get("fileDataBag", [])

        # Be defensive: a non-list bag yields no files, and entries that are
        # not dictionaries are skipped.
        files = []
        if isinstance(entries, list):
            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                files.append(
                    FileData.from_dict(
                        entry,
                        product_identifier=product_identifier,
                        include_raw_data=include_raw_data,
                    )
                )

        raw_json = None
        if include_raw_data:
            raw_json = json.dumps(data)

        return cls(
            count=data.get("count", 0),
            file_data_bag=files,
            raw_data=raw_json,
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialise the bag to a dictionary with camelCase keys.

        Values that are None and lists that are empty are left out.

        Returns:
            Dict[str, Any]: Dictionary representation with camelCase keys.
        """
        candidates = {
            "count": self.count,
            "fileDataBag": [entry.to_dict() for entry in self.file_data_bag],
        }

        payload: dict[str, Any] = {}
        for key, value in candidates.items():
            if value is None:
                continue
            if isinstance(value, list) and not value:
                continue
            payload[key] = value
        return payload
@dataclass(frozen=True)
class BulkDataProduct:
    """A product listed by the bulk data API.

    Attributes:
        product_identifier: Unique identifier for the product.
        product_description_text: Description of the product.
        product_title_text: Title of the product.
        product_frequency_text: Update frequency description.
        product_label_array_text: Labels associated with the product.
        product_dataset_array_text: Datasets included in the product.
        product_dataset_category_array_text: Categories of datasets.
        product_from_date: Start date of data in the product.
        product_to_date: End date of data in the product.
        product_total_file_size: Total size of all files in bytes.
        product_file_total_quantity: Number of files in the product.
        last_modified_date_time: Last modification timestamp.
        mime_type_identifier_array_text: MIME types of files in the product.
        product_file_bag: Container with file data.
        days_of_week_text: Days of the week for updates (if applicable).
        raw_data: Optional raw JSON data from the API response (for
            debugging). Excluded from equality comparison and ``repr``.
    """

    product_identifier: str
    product_description_text: str
    product_title_text: str
    product_frequency_text: str
    product_label_array_text: list[str] = field(default_factory=list)
    product_dataset_array_text: list[str] = field(default_factory=list)
    product_dataset_category_array_text: list[str] = field(default_factory=list)
    product_from_date: date | None = None
    product_to_date: date | None = None
    product_total_file_size: int = 0
    product_file_total_quantity: int = 0
    last_modified_date_time: datetime | None = None
    mime_type_identifier_array_text: list[str] = field(default_factory=list)
    product_file_bag: ProductFileBag | None = None
    days_of_week_text: str | None = None
    raw_data: str | None = field(default=None, compare=False, repr=False)

    @classmethod
    def from_dict(
        cls, data: dict[str, Any], include_raw_data: bool = False
    ) -> "BulkDataProduct":
        """Build a BulkDataProduct from an API response dictionary.

        Args:
            data: Dictionary containing product data from API response.
            include_raw_data: If True, store the raw JSON for debugging
                (also forwarded to the nested file bag).

        Returns:
            BulkDataProduct: An instance of BulkDataProduct.
        """

        def list_or_empty(key: str) -> list[Any]:
            # Defensive parsing: anything that is not a list becomes [].
            found = data.get(key, [])
            return found if isinstance(found, list) else []

        labels = list_or_empty("productLabelArrayText")
        datasets = list_or_empty("productDatasetArrayText")
        dataset_categories = list_or_empty("productDatasetCategoryArrayText")
        mime_types = list_or_empty("mimeTypeIdentifierArrayText")

        # Parse product file bag
        # TODO: this does not seem to be available in search responses.
        bag_payload = data.get("productFileBag")
        file_bag = None
        if bag_payload and isinstance(bag_payload, dict):
            file_bag = ProductFileBag.from_dict(
                bag_payload,
                product_identifier=data.get("productIdentifier", ""),
                include_raw_data=include_raw_data,
            )

        covered_from = parse_to_date(data.get("productFromDate"))
        covered_to = parse_to_date(data.get("productToDate"))
        modified = parse_to_datetime_utc(data.get("lastModifiedDateTime"))

        raw_json = None
        if include_raw_data:
            raw_json = json.dumps(data)

        return cls(
            product_identifier=data.get("productIdentifier", ""),
            product_description_text=data.get("productDescriptionText", ""),
            product_title_text=data.get("productTitleText", ""),
            product_frequency_text=data.get("productFrequencyText", ""),
            days_of_week_text=data.get("daysOfWeekText"),
            product_label_array_text=labels,
            product_dataset_array_text=datasets,
            product_dataset_category_array_text=dataset_categories,
            product_from_date=covered_from,
            product_to_date=covered_to,
            product_total_file_size=data.get("productTotalFileSize", 0),
            product_file_total_quantity=data.get("productFileTotalQuantity", 0),
            last_modified_date_time=modified,
            mime_type_identifier_array_text=mime_types,
            product_file_bag=file_bag,
            raw_data=raw_json,
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialise the product to a dictionary with camelCase keys.

        Values that are None and lists that are empty are left out.
        ``raw_data`` is not part of the output.

        Returns:
            Dict[str, Any]: Dictionary representation with camelCase keys.
        """
        modified = None
        if self.last_modified_date_time:
            modified = serialize_datetime_as_naive(self.last_modified_date_time)

        file_bag = None
        if self.product_file_bag:
            file_bag = self.product_file_bag.to_dict()

        candidates = (
            ("productIdentifier", self.product_identifier),
            ("productDescriptionText", self.product_description_text),
            ("productTitleText", self.product_title_text),
            ("productFrequencyText", self.product_frequency_text),
            ("daysOfWeekText", self.days_of_week_text),
            ("productLabelArrayText", self.product_label_array_text),
            ("productDatasetArrayText", self.product_dataset_array_text),
            (
                "productDatasetCategoryArrayText",
                self.product_dataset_category_array_text,
            ),
            ("productFromDate", serialize_date(self.product_from_date)),
            ("productToDate", serialize_date(self.product_to_date)),
            ("productTotalFileSize", self.product_total_file_size),
            ("productFileTotalQuantity", self.product_file_total_quantity),
            ("lastModifiedDateTime", modified),
            ("mimeTypeIdentifierArrayText", self.mime_type_identifier_array_text),
            ("productFileBag", file_bag),
        )

        payload: dict[str, Any] = {}
        for key, value in candidates:
            if value is None:
                continue
            if isinstance(value, list) and not value:
                continue
            payload[key] = value
        return payload
@dataclass(frozen=True)
class BulkDataResponse:
    """Top-level response returned by the bulk data API.

    Attributes:
        count: The number of bulk data products in the response, as given
            by the API.
        bulk_data_product_bag: List of bulk data products.
        raw_data: Optional raw JSON data from the API response (for
            debugging). Excluded from equality comparison and ``repr``.
    """

    count: int
    bulk_data_product_bag: list[BulkDataProduct] = field(default_factory=list)
    raw_data: str | None = field(default=None, compare=False, repr=False)

    @classmethod
    def from_dict(
        cls, data: dict[str, Any], include_raw_data: bool = False
    ) -> "BulkDataResponse":
        """Build a BulkDataResponse from an API response dictionary.

        Args:
            data: Dictionary containing API response data.
            include_raw_data: If True, store the raw JSON for debugging and
                propagate to nested models.

        Returns:
            BulkDataResponse: An instance of BulkDataResponse.
        """
        entries = data.get("bulkDataProductBag", [])

        # Be defensive: a non-list bag yields no products, and entries that
        # are not dictionaries are skipped.
        products = []
        if isinstance(entries, list):
            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                products.append(
                    BulkDataProduct.from_dict(
                        entry, include_raw_data=include_raw_data
                    )
                )

        raw_json = None
        if include_raw_data:
            raw_json = json.dumps(data)

        return cls(
            count=data.get("count", 0),
            bulk_data_product_bag=products,
            raw_data=raw_json,
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialise the response to a dictionary with camelCase keys.

        Values that are None and lists that are empty are left out.

        Returns:
            Dict[str, Any]: Dictionary representation with camelCase keys.
        """
        candidates = {
            "count": self.count,
            "bulkDataProductBag": [
                item.to_dict() for item in self.bulk_data_product_bag
            ],
        }

        payload: dict[str, Any] = {}
        for key, value in candidates.items():
            if value is None:
                continue
            if isinstance(value, list) and not value:
                continue
            payload[key] = value
        return payload