Source code for clinvar_build.schema_generator

"""
ClinVar SQLite Schema Generator.

This module provides schema generation for ClinVar databases:
- RCV (Reference ClinVar Assertion) - condition-centric format
- VCV (Variant Call Variation) - variant-centric format

The SQL schemas are based on the following XSD:
- RCV: ClinVar_RCV_weekly.xsd v2.2 (August 6, 2025)
- VCV: ClinVar_VCV.xsd v2.5 (August 6, 2025)

Notes
-----
This module creates only the database schema structure (empty tables).
To populate databases with actual ClinVar data, use clinvar_parser.py.
"""

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# imports
import os
import sys
import sqlite3
import logging
import argparse
import clinvar_build.utils.config_tools as cnf_tools
from pathlib import Path
from typing import Iterator
from clinvar_build.errors import (
    is_type,
)
from clinvar_build.utils.general import (
    _check_directory,
    _check_directory_readable,
)
from clinvar_build.utils.parser_tools import (
    configure_logging,
)
from clinvar_build.constants import (
    DBNames,
    SchemaNames,
)
from contextlib import contextmanager

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# initiating a logger
logger = logging.getLogger(__name__)

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[docs] class ClinVarSchemaGenerator(object): """ Builder for ClinVar SQLite database schemas. This class provides methods to create database schemas for ClinVar RCV (condition-centric), and VCV (variant-centric), It handles connecting to the SQLite database, executing table and index creation, and safely closing the connection. Attributes ---------- conn : `sqlite3.Connection` or `None` Active SQLite connection, or None if not connected. cursor : `sqlite3.Cursor` or `None` Cursor for executing SQL statements, or None if not connected. Methods ------- create_schema(db_path, db_config, db_indices=None, name=None) Create a SQLite schema for a given ClinVar database. """ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def __init__(self): """Initialise the schema builder.""" self.conn = None self.cursor = None
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def __repr__(self) -> str: """ Return unambiguous string representation. Returns ------- str String representation suitable for debugging """ connection_status = ( f"connected to {self.conn}" if self.conn else "not connected" ) return f"{type(self).__name__}({connection_status})"
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def __str__(self) -> str: """ Return human-readable string representation. Returns ------- str Human-readable description of the builder """ if self.conn: return ( f"{type(self).__name__} (active connection)" ) return ( f"{type(self).__name__}\n" "Available methods:\n" " - create_schema(db_path, db_config, db_indices, name): " "Create database schema" )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def _connect(self, db_path: str | Path) -> None: """ Establish database connection. Parameters ---------- db_path : `str` Path to SQLite database file """ self.conn = sqlite3.connect(db_path) self.cursor = self.conn.cursor() self.cursor.execute("PRAGMA foreign_keys = ON") # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def _close(self) -> None: """Close database connection.""" if self.conn: self.conn.commit() self.conn.close() self.conn = None self.cursor = None # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @contextmanager def _connection(self, db_path: str | Path) -> Iterator[None]: """ Context manager for SQLite database connection. Establishes a connection to the SQLite database at `db_path` and ensures the connection is properly closed after use, even if an exception occurs during operations. Parameters ---------- db_path : `str` or `Path` Path to the SQLite database file. Yields ------ Provides a context in which the database connection is active. `self.conn` and `self.cursor` are available for executing SQL statements within the context. """ self._connect(db_path) try: yield finally: self._close() # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def create_schema(self, db_path: str | Path, db_config: dict[str, str], db_indices: dict[str, str] | None, name: str | None = None, ) -> None: """ Create a SQLite schema for ClinVar (RCV, VCV). Parameters ---------- db_path : `str` or `Path` Path to SQLite database file. db_config : `dict` [`str`, `str`] Dictionary with SQL CREATE TABLE statements. db_indices : `dict` [`str`, `str`] or `None`, default `None` Dictionary with SQL CREATE INDEX statements. name : `str` or `None`, default `None` Optional name for logging. Defaults to the filename stem of db_path. Returns ------- None """ is_type(db_config, dict) is_type(db_indices, (type(None), dict)) is_type(name, (type(None), str)) is_type(db_path, (Path, str)) # extract the filename from the db_path. if name is None: name = Path(db_path).stem # Initiate logger nad connecdt databse logger.info(f"Creating {name} schema: {db_path}") with self._connection(db_path): for stmt, columns in db_config.items(): sql = f"{stmt} {columns}" sql = " ".join(sql.split()) logger.debug(f"Adding tables: {sql}") self.cursor.execute(sql) # self.cursor.execute(f"{stmt} {columns}") if db_indices: for stmt, on_clause in db_indices.items(): logger.debug(f"Adding indices: {stmt} {on_clause}") self.cursor.execute(f"{stmt} {on_clause}") logger.info(f"{name} schema created successfully: {db_path}")
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[docs] def parse_arguments(): """ Parse command-line arguments for schema generation. Returns ------- argparse.Namespace Parsed command-line arguments containing directory, rcv, and vcv attributes. """ parser = argparse.ArgumentParser( description='Generate ClinVar SQLite database schemas' ) parser.add_argument( 'directory', type=str, help='Directory path where database files will be created' ) parser.add_argument( '--rcv', action='store_true', help='Create RCV schema' ) parser.add_argument( '--vcv', action='store_true', help='Create VCV schema' ) parser.add_argument( '-v', '--verbose', action='count', default=0, help='Increase verbosity (-v for INFO, -vv for DEBUG)' ) # return return parser.parse_args()
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
def main():
    """
    Command-line entry point for schema generation.

    Parses the command-line arguments, configures logging, runs the
    directory checks on the output directory, creates each requested
    schema and prints a summary of what was created.

    Examples
    --------
    # Create RCV schema
    python clinvar_schema.py /data --rcv

    # Create RCV and VCV schemas
    python clinvar_schema.py /data --rcv --vcv
    """
    args = parse_arguments()
    # verbosity count decides the logging level
    configure_logging(args.verbose)
    # directory checks
    out_dir = args.directory
    _check_directory(out_dir)
    _check_directory_readable(out_dir)
    # database file locations
    file_rcv = os.path.join(out_dir, DBNames.rcv)
    file_vcv = os.path.join(out_dir, DBNames.vcv)
    # location of the schema configuration files
    cnfpath = os.path.join(cnf_tools.check_environ(), 'schema')
    builder = ClinVarSchemaGenerator()

    def _build(label, schema_file, db_file):
        # Parse one schema configuration and create the matching database.
        logger.info(f'Parsing {label} configuration')
        config = cnf_tools.BlockConfigParser(
            os.path.join(cnfpath, schema_file))
        tables = config(SchemaNames.tables).parsed_data
        indices = config(SchemaNames.indices).parsed_data
        builder.create_schema(db_path=db_file,
                              db_config=tables,
                              db_indices=indices,
                              )

    if args.rcv:
        _build('RCV', SchemaNames.rcv, file_rcv)
    if args.vcv:
        _build('VCV', SchemaNames.vcv, file_vcv)
    # Finished
    logger.info("Schema generation completed successfully")
    # collect the schemas that were requested, in creation order
    created_schemas = []
    if args.rcv:
        created_schemas.append(('RCV', file_rcv))
    if args.vcv:
        created_schemas.append(('VCV', file_vcv))
    # Print summary for user
    rule = "="*70
    print("\n" + rule)
    print("SCHEMA GENERATION SUMMARY")
    print(rule)
    print(f"Output Directory: {args.directory}")
    print(f"\nCreated Schemas ({len(created_schemas)}):")
    for schema_name, schema_path in created_schemas:
        print(f" ✓ {schema_name:12s}{schema_path}")
    print(rule)
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
if __name__ == "__main__":
    # Run the CLI and translate the outcome into a process exit status:
    # 0 on success, 1 when interrupted or when any other error is raised.
    exit_status = 0
    try:
        main()
    except KeyboardInterrupt:
        print("\nInterrupted by user")
        exit_status = 1
    except Exception as err:
        print(f"Error: {err}")
        exit_status = 1
    sys.exit(exit_status)