"""models.bulk_data - Data models for the USPTO Bulk Data API.

This module provides data models for the USPTO Open Data Portal (ODP) Bulk Data API.
"""
import json
from dataclasses import dataclass, field
from datetime import date, datetime
from enum import Enum
from typing import Any
from pyUSPTO.models.utils import (
parse_to_date,
parse_to_datetime_utc,
serialize_date,
serialize_datetime_as_naive,
)
# --- Enums for Categorical Data ---
class FileTypeCategory(Enum):
    """File type categories in bulk data.

    This enum provides type-safe representations of file types commonly
    found in USPTO bulk data products.

    Lookup by value is lenient: strings that are not an exact member value
    are normalized by :meth:`_missing_` before being rejected, so
    ``FileTypeCategory("zip")`` and ``FileTypeCategory("tar.gz")`` both work.
    """

    ZIP = "ZIP"
    TAR = "TAR"
    TAR_GZ = "TAR_GZ"
    TGZ = "TGZ"
    XML = "XML"
    JSON = "JSON"
    CSV = "CSV"

    @classmethod
    def _missing_(cls, value: Any) -> "FileTypeCategory | None":
        """Resolve a value that did not match any member value exactly.

        String input is stripped of surrounding whitespace, upper-cased, and
        has ``.`` and ``-`` replaced by ``_`` before matching, so spellings
        such as ``"tar.gz"``, ``"Tar-Gz"`` or ``" zip "`` are accepted.

        Args:
            value: The value passed to ``FileTypeCategory(value)``.

        Returns:
            The matching member, or ``None`` when nothing matches (in which
            case ``Enum`` raises ``ValueError`` for the caller).
        """
        if not isinstance(value, str):
            return None

        normalized = value.strip().upper().replace(".", "_").replace("-", "_")

        # Gzipped-tarball spellings all collapse to TAR_GZ. The exact value
        # "TGZ" never reaches this hook (Enum resolves it to the TGZ member
        # first), so only non-exact spellings such as "tgz" are affected.
        # After normalization a literal "." can no longer occur, hence there
        # is no "TAR.GZ" entry in this tuple.
        if normalized in ("TAR_GZ", "TARGZ", "TGZ"):
            return cls.TAR_GZ

        # Case-insensitive match against the declared member values.
        for member in cls:
            if member.value.upper() == normalized:
                return member
        return None
class ProductFrequency(Enum):
    """Product update frequency categories.

    Represents how often a bulk data product is updated.
    """

    DAILY = "DAILY"
    WEEKLY = "WEEKLY"
    MONTHLY = "MONTHLY"
    QUARTERLY = "QUARTERLY"
    ANNUALLY = "ANNUALLY"
    AD_HOC = "AD_HOC"

    @classmethod
    def _missing_(cls, value: Any) -> "ProductFrequency | None":
        """Resolve a value that did not match any member value exactly.

        String input is upper-cased and has spaces and hyphens replaced by
        underscores, so ``"daily"``, ``"ad hoc"`` and ``"Ad-Hoc"`` resolve.

        Args:
            value: The value passed to ``ProductFrequency(value)``.

        Returns:
            The matching member, or ``None`` when nothing matches (in which
            case ``Enum`` raises ``ValueError`` for the caller).
        """
        if not isinstance(value, str):
            return None

        key = value.upper()
        for separator in (" ", "-"):
            key = key.replace(separator, "_")

        return next(
            (candidate for candidate in cls if candidate.value.upper() == key),
            None,
        )
[docs]
@dataclass(frozen=True)
class FileData:
"""Represent a file in the bulk data API.
Attributes:
file_name: The name of the file.
file_size: Size of the file in bytes.
product_identifier: The identifier of the product this file belongs to.
file_data_from_date: Start date of data covered in the file.
file_data_to_date: End date of data covered in the file.
file_type_text: Description of the file type.
file_release_date: Date when the file was released.
file_download_uri: URL for downloading the file.
file_date: Additional file date information.
file_last_modified_date_time: Last modification timestamp.
raw_data: Optional raw JSON data from the API response (for debugging).
"""
file_name: str
file_size: int
product_identifier: str
file_data_from_date: date | None
file_data_to_date: date | None
file_type_text: str
file_release_date: date | None
file_download_uri: str | None = None
file_date: date | None = None
file_last_modified_date_time: datetime | None = None
raw_data: str | None = field(default=None, compare=False, repr=False)
[docs]
@classmethod
def from_dict(
cls,
data: dict[str, Any],
product_identifier: str,
include_raw_data: bool = False,
) -> "FileData":
"""Create a FileData object from a dictionary.
Args:
data: Dictionary containing file data from API response.
product_identifier: The identifier of the product this file belongs to.
include_raw_data: If True, store the raw JSON for debugging.
Returns:
FileData: An instance of FileData.
"""
return cls(
file_name=data.get("fileName", ""),
file_size=data.get("fileSize", 0),
product_identifier=product_identifier,
file_data_from_date=parse_to_date(data.get("fileDataFromDate")),
file_data_to_date=parse_to_date(data.get("fileDataToDate")),
file_type_text=data.get("fileTypeText", ""),
file_release_date=parse_to_date(data.get("fileReleaseDate")),
file_download_uri=data.get("fileDownloadURI"),
file_date=parse_to_date(data.get("fileDate")),
file_last_modified_date_time=parse_to_datetime_utc(
data.get("fileLastModifiedDateTime")
),
raw_data=json.dumps(data) if include_raw_data else None,
)
[docs]
def to_dict(self) -> dict[str, Any]:
"""Convert the FileData object to a dictionary.
Returns:
Dict[str, Any]: Dictionary representation with camelCase keys.
"""
d = {
"fileName": self.file_name,
"fileSize": self.file_size,
"fileDataFromDate": serialize_date(self.file_data_from_date),
"fileDataToDate": serialize_date(self.file_data_to_date),
"fileTypeText": self.file_type_text,
"fileReleaseDate": serialize_date(self.file_release_date),
"fileDownloadURI": self.file_download_uri,
"fileDate": serialize_date(self.file_date),
"fileLastModifiedDateTime": (
serialize_datetime_as_naive(self.file_last_modified_date_time)
if self.file_last_modified_date_time
else None
),
}
return {k: v for k, v in d.items() if v is not None}
@dataclass(frozen=True)
class ProductFileBag:
    """Container for file data elements.

    Attributes:
        count: The number of files in the bag.
        file_data_bag: List of FileData objects.
        raw_data: Optional raw JSON data from the API response (for debugging).
            Excluded from equality comparison and from ``repr``.
    """

    count: int
    file_data_bag: list[FileData] = field(default_factory=list)
    raw_data: str | None = field(default=None, compare=False, repr=False)

    @classmethod
    def from_dict(
        cls,
        data: dict[str, Any],
        product_identifier: str,
        include_raw_data: bool = False,
    ) -> "ProductFileBag":
        """Create a ProductFileBag object from a dictionary.

        A ``fileDataBag`` value that is not a list yields an empty bag, and
        entries inside the list that are not dictionaries are skipped. A
        missing or ``null`` ``count`` becomes ``0``.

        Args:
            data: Dictionary containing product file bag data.
            product_identifier: The identifier of the product this bag belongs to.
            include_raw_data: If True, store the raw JSON for debugging.

        Returns:
            ProductFileBag: An instance of ProductFileBag.
        """
        # Defensive parsing: tolerate a non-list bag and non-dict entries.
        raw_files = data.get("fileDataBag", [])
        files: list[FileData] = []
        if isinstance(raw_files, list):
            files = [
                FileData.from_dict(
                    entry,
                    product_identifier=product_identifier,
                    include_raw_data=include_raw_data,
                )
                for entry in raw_files
                if isinstance(entry, dict)
            ]

        # dict.get() only applies its default when the key is absent, so an
        # explicit JSON null is coalesced to keep ``count`` an int.
        count = data.get("count")

        return cls(
            count=0 if count is None else count,
            file_data_bag=files,
            raw_data=json.dumps(data) if include_raw_data else None,
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert the ProductFileBag object to a dictionary.

        Entries whose value is ``None`` or an empty list are omitted.

        Returns:
            Dict[str, Any]: Dictionary representation with camelCase keys.
        """
        d: dict[str, Any] = {
            "count": self.count,
            "fileDataBag": [file_data.to_dict() for file_data in self.file_data_bag],
        }
        return {
            k: v
            for k, v in d.items()
            if v is not None and not (isinstance(v, list) and not v)
        }
@dataclass(frozen=True)
class BulkDataProduct:
    """Represent a product in the bulk data API.

    Attributes:
        product_identifier: Unique identifier for the product.
        product_description_text: Description of the product.
        product_title_text: Title of the product.
        product_frequency_text: Update frequency description.
        product_label_array_text: Labels associated with the product.
        product_dataset_array_text: Datasets included in the product.
        product_dataset_category_array_text: Categories of datasets.
        product_from_date: Start date of data in the product.
        product_to_date: End date of data in the product.
        product_total_file_size: Total size of all files in bytes.
        product_file_total_quantity: Number of files in the product.
        last_modified_date_time: Last modification timestamp.
        mime_type_identifier_array_text: MIME types of files in the product.
        product_file_bag: Container with file data.
        days_of_week_text: Days of the week for updates (if applicable).
        raw_data: Optional raw JSON data from the API response (for debugging).
            Excluded from equality comparison and from ``repr``.
    """

    product_identifier: str
    product_description_text: str
    product_title_text: str
    product_frequency_text: str
    product_label_array_text: list[str] = field(default_factory=list)
    product_dataset_array_text: list[str] = field(default_factory=list)
    product_dataset_category_array_text: list[str] = field(default_factory=list)
    product_from_date: date | None = None
    product_to_date: date | None = None
    product_total_file_size: int = 0
    product_file_total_quantity: int = 0
    last_modified_date_time: datetime | None = None
    mime_type_identifier_array_text: list[str] = field(default_factory=list)
    product_file_bag: ProductFileBag | None = None
    days_of_week_text: str | None = None
    raw_data: str | None = field(default=None, compare=False, repr=False)

    @classmethod
    def from_dict(
        cls, data: dict[str, Any], include_raw_data: bool = False
    ) -> "BulkDataProduct":
        """Create a BulkDataProduct object from a dictionary.

        Missing keys and explicit ``null`` values are treated the same way for
        the non-optional fields: text fields fall back to ``""`` and numeric
        fields to ``0``. List fields whose value is not a list become ``[]``.

        Args:
            data: Dictionary containing product data from API response.
            include_raw_data: If True, store the raw JSON for debugging.

        Returns:
            BulkDataProduct: An instance of BulkDataProduct.
        """

        def list_or_empty(key: str) -> list[Any]:
            # Defensive parsing: anything that is not a list becomes [].
            value = data.get(key, [])
            return value if isinstance(value, list) else []

        def value_or(key: str, default: Any) -> Any:
            # dict.get() only applies its default when the key is absent; an
            # explicit JSON null must not leak None into str/int fields.
            value = data.get(key)
            return default if value is None else value

        product_identifier = value_or("productIdentifier", "")

        # TODO: productFileBag does not seem to be available in search responses.
        file_bag_data = data.get("productFileBag")
        product_file_bag = None
        if file_bag_data and isinstance(file_bag_data, dict):
            product_file_bag = ProductFileBag.from_dict(
                file_bag_data,
                product_identifier=product_identifier,
                include_raw_data=include_raw_data,
            )

        return cls(
            product_identifier=product_identifier,
            product_description_text=value_or("productDescriptionText", ""),
            product_title_text=value_or("productTitleText", ""),
            product_frequency_text=value_or("productFrequencyText", ""),
            days_of_week_text=data.get("daysOfWeekText"),
            product_label_array_text=list_or_empty("productLabelArrayText"),
            product_dataset_array_text=list_or_empty("productDatasetArrayText"),
            product_dataset_category_array_text=list_or_empty(
                "productDatasetCategoryArrayText"
            ),
            product_from_date=parse_to_date(data.get("productFromDate")),
            product_to_date=parse_to_date(data.get("productToDate")),
            product_total_file_size=value_or("productTotalFileSize", 0),
            product_file_total_quantity=value_or("productFileTotalQuantity", 0),
            last_modified_date_time=parse_to_datetime_utc(
                data.get("lastModifiedDateTime")
            ),
            mime_type_identifier_array_text=list_or_empty(
                "mimeTypeIdentifierArrayText"
            ),
            product_file_bag=product_file_bag,
            raw_data=json.dumps(data) if include_raw_data else None,
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert the BulkDataProduct object to a dictionary.

        Entries whose value is ``None`` or an empty list are omitted.
        ``raw_data`` is not part of the output.

        Returns:
            Dict[str, Any]: Dictionary representation with camelCase keys.
        """
        last_modified = (
            serialize_datetime_as_naive(self.last_modified_date_time)
            if self.last_modified_date_time is not None
            else None
        )
        file_bag = (
            self.product_file_bag.to_dict() if self.product_file_bag else None
        )
        d: dict[str, Any] = {
            "productIdentifier": self.product_identifier,
            "productDescriptionText": self.product_description_text,
            "productTitleText": self.product_title_text,
            "productFrequencyText": self.product_frequency_text,
            "daysOfWeekText": self.days_of_week_text,
            "productLabelArrayText": self.product_label_array_text,
            "productDatasetArrayText": self.product_dataset_array_text,
            "productDatasetCategoryArrayText": self.product_dataset_category_array_text,
            "productFromDate": serialize_date(self.product_from_date),
            "productToDate": serialize_date(self.product_to_date),
            "productTotalFileSize": self.product_total_file_size,
            "productFileTotalQuantity": self.product_file_total_quantity,
            "lastModifiedDateTime": last_modified,
            "mimeTypeIdentifierArrayText": self.mime_type_identifier_array_text,
            "productFileBag": file_bag,
        }
        return {
            k: v
            for k, v in d.items()
            if v is not None and not (isinstance(v, list) and not v)
        }
@dataclass(frozen=True)
class BulkDataResponse:
    """Top-level response from the bulk data API.

    Attributes:
        count: The number of bulk data products in the response.
        bulk_data_product_bag: List of bulk data products.
        raw_data: Optional raw JSON data from the API response (for debugging).
            Excluded from equality comparison and from ``repr``.
    """

    count: int
    bulk_data_product_bag: list[BulkDataProduct] = field(default_factory=list)
    raw_data: str | None = field(default=None, compare=False, repr=False)

    @classmethod
    def from_dict(
        cls, data: dict[str, Any], include_raw_data: bool = False
    ) -> "BulkDataResponse":
        """Create a BulkDataResponse object from a dictionary.

        A ``bulkDataProductBag`` value that is not a list yields an empty
        bag, and entries inside the list that are not dictionaries are
        skipped. A missing or ``null`` ``count`` becomes ``0``.

        Args:
            data: Dictionary containing API response data.
            include_raw_data: If True, store the raw JSON for debugging and
                propagate to nested models.

        Returns:
            BulkDataResponse: An instance of BulkDataResponse.
        """
        # Defensive parsing: tolerate a non-list bag and non-dict entries.
        raw_products = data.get("bulkDataProductBag", [])
        products: list[BulkDataProduct] = []
        if isinstance(raw_products, list):
            products = [
                BulkDataProduct.from_dict(entry, include_raw_data=include_raw_data)
                for entry in raw_products
                if isinstance(entry, dict)
            ]

        # dict.get() only applies its default when the key is absent, so an
        # explicit JSON null is coalesced to keep ``count`` an int.
        count = data.get("count")

        return cls(
            count=0 if count is None else count,
            bulk_data_product_bag=products,
            raw_data=json.dumps(data) if include_raw_data else None,
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert the BulkDataResponse object to a dictionary.

        Entries whose value is ``None`` or an empty list are omitted.

        Returns:
            Dict[str, Any]: Dictionary representation with camelCase keys.
        """
        d: dict[str, Any] = {
            "count": self.count,
            "bulkDataProductBag": [
                product.to_dict() for product in self.bulk_data_product_bag
            ],
        }
        return {
            k: v
            for k, v in d.items()
            if v is not None and not (isinstance(v, list) and not v)
        }