"""
ClinVar SQLite Schema Generator.
This module provides schema generation for ClinVar databases:
- RCV (Reference ClinVar Assertion) - condition-centric format
- VCV (Variant Call Variation) - variant-centric format
The SQL schemas are based on the following XSD:
- RCV: ClinVar_RCV_weekly.xsd v2.2 (August 6, 2025)
- VCV: ClinVar_VCV.xsd v2.5 (August 6, 2025)
Notes
-----
This module creates only the database schema structure (empty tables).
To populate databases with actual ClinVar data, use clinvar_parser.py.
"""
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# imports
import os
import sys
import sqlite3
import logging
import argparse
import clinvar_build.utils.config_tools as cnf_tools
from pathlib import Path
from typing import Iterator
from clinvar_build.errors import (
is_type,
)
from clinvar_build.utils.general import (
_check_directory,
_check_directory_readable,
)
from clinvar_build.utils.parser_tools import (
configure_logging,
)
from clinvar_build.constants import (
DBNames,
SchemaNames,
)
from contextlib import contextmanager
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Module-level logger, named after this module via ``__name__``; used by the
# schema generator class and the command-line entry point below.
logger = logging.getLogger(__name__)
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[docs]
class ClinVarSchemaGenerator(object):
"""
Builder for ClinVar SQLite database schemas.
This class provides methods to create database schemas for
ClinVar RCV (condition-centric), and VCV (variant-centric),
It handles connecting to the SQLite database, executing table and index
creation, and safely closing the connection.
Attributes
----------
conn : `sqlite3.Connection` or `None`
Active SQLite connection, or None if not connected.
cursor : `sqlite3.Cursor` or `None`
Cursor for executing SQL statements, or None if not connected.
Methods
-------
create_schema(db_path, db_config, db_indices=None, name=None)
Create a SQLite schema for a given ClinVar database.
"""
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def __init__(self):
"""Initialise the schema builder."""
self.conn = None
self.cursor = None
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def __repr__(self) -> str:
"""
Return unambiguous string representation.
Returns
-------
str
String representation suitable for debugging
"""
connection_status = (
f"connected to {self.conn}"
if self.conn
else "not connected"
)
return f"{type(self).__name__}({connection_status})"
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def __str__(self) -> str:
"""
Return human-readable string representation.
Returns
-------
str
Human-readable description of the builder
"""
if self.conn:
return (
f"{type(self).__name__} (active connection)"
)
return (
f"{type(self).__name__}\n"
"Available methods:\n"
" - create_schema(db_path, db_config, db_indices, name): "
"Create database schema"
)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _connect(self, db_path: str | Path) -> None:
"""
Establish database connection.
Parameters
----------
db_path : `str`
Path to SQLite database file
"""
self.conn = sqlite3.connect(db_path)
self.cursor = self.conn.cursor()
self.cursor.execute("PRAGMA foreign_keys = ON")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _close(self) -> None:
"""Close database connection."""
if self.conn:
self.conn.commit()
self.conn.close()
self.conn = None
self.cursor = None
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@contextmanager
def _connection(self, db_path: str | Path) -> Iterator[None]:
"""
Context manager for SQLite database connection.
Establishes a connection to the SQLite database at `db_path`
and ensures the connection is properly closed after use, even
if an exception occurs during operations.
Parameters
----------
db_path : `str` or `Path`
Path to the SQLite database file.
Yields
------
Provides a context in which the database connection is active.
`self.conn` and `self.cursor` are available for executing SQL
statements within the context.
"""
self._connect(db_path)
try:
yield
finally:
self._close()
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def create_schema(self, db_path: str | Path, db_config: dict[str, str],
db_indices: dict[str, str] | None,
name: str | None = None,
) -> None:
"""
Create a SQLite schema for ClinVar (RCV, VCV).
Parameters
----------
db_path : `str` or `Path`
Path to SQLite database file.
db_config : `dict` [`str`, `str`]
Dictionary with SQL CREATE TABLE statements.
db_indices : `dict` [`str`, `str`] or `None`, default `None`
Dictionary with SQL CREATE INDEX statements.
name : `str` or `None`, default `None`
Optional name for logging. Defaults to the filename stem of db_path.
Returns
-------
None
"""
is_type(db_config, dict)
is_type(db_indices, (type(None), dict))
is_type(name, (type(None), str))
is_type(db_path, (Path, str))
# extract the filename from the db_path.
if name is None:
name = Path(db_path).stem
# Initiate logger nad connecdt databse
logger.info(f"Creating {name} schema: {db_path}")
with self._connection(db_path):
for stmt, columns in db_config.items():
sql = f"{stmt} {columns}"
sql = " ".join(sql.split())
logger.debug(f"Adding tables: {sql}")
self.cursor.execute(sql)
# self.cursor.execute(f"{stmt} {columns}")
if db_indices:
for stmt, on_clause in db_indices.items():
logger.debug(f"Adding indices: {stmt} {on_clause}")
self.cursor.execute(f"{stmt} {on_clause}")
logger.info(f"{name} schema created successfully: {db_path}")
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[docs]
def parse_arguments():
    """
    Build the command-line parser and parse ``sys.argv``.

    The interface consists of one positional argument (the output
    directory), two boolean switches selecting which schema to create and
    a repeatable verbosity flag.

    Returns
    -------
    argparse.Namespace
        Namespace with the attributes ``directory`` (str), ``rcv`` (bool),
        ``vcv`` (bool) and ``verbose`` (int, number of ``-v`` occurrences).
    """
    cli = argparse.ArgumentParser(
        description='Generate ClinVar SQLite database schemas',
    )
    # positional: where the database files end up
    cli.add_argument(
        'directory', type=str,
        help='Directory path where database files will be created',
    )
    # one on/off switch per schema flavour, registered in a fixed order
    schema_switches = (
        ('--rcv', 'Create RCV schema'),
        ('--vcv', 'Create VCV schema'),
    )
    for switch, description in schema_switches:
        cli.add_argument(switch, action='store_true', help=description)
    # every additional -v raises the counter by one
    cli.add_argument(
        '-v', '--verbose', action='count', default=0,
        help='Increase verbosity (-v for INFO, -vv for DEBUG)',
    )
    namespace = cli.parse_args()
    return namespace
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[docs]
def main():
    """
    Run schema generation from the command line.

    Parses the command-line arguments, configures logging, checks the
    output directory, then creates the RCV and/or VCV schema (whichever
    was requested) and prints a short summary to standard output.

    Examples
    --------
    # Create RCV schema
    python clinvar_schema.py /data --rcv
    # Create RCV and VCV schemas
    python clinvar_schema.py /data --rcv --vcv
    """
    args = parse_arguments()
    # verbosity counter from the CLI drives the logging set-up
    configure_logging(args.verbose)
    # validate the output directory before doing anything else
    _check_directory(args.directory)
    _check_directory_readable(args.directory)
    # target database files, one per schema flavour
    rcv_db = os.path.join(args.directory, DBNames.rcv)
    vcv_db = os.path.join(args.directory, DBNames.vcv)
    # schema definitions live in the 'schema' folder of the config location
    schema_dir = os.path.join(cnf_tools.check_environ(), 'schema')
    generator = ClinVarSchemaGenerator()
    if args.rcv:
        logger.info('Parsing RCV configuration')
        rcv_parser = cnf_tools.BlockConfigParser(
            os.path.join(schema_dir, SchemaNames.rcv))
        rcv_table_sql = rcv_parser(SchemaNames.tables).parsed_data
        rcv_index_sql = rcv_parser(SchemaNames.indices).parsed_data
        generator.create_schema(
            db_path=rcv_db,
            db_config=rcv_table_sql,
            db_indices=rcv_index_sql,
        )
    if args.vcv:
        logger.info('Parsing VCV configuration')
        vcv_parser = cnf_tools.BlockConfigParser(
            os.path.join(schema_dir, SchemaNames.vcv))
        vcv_table_sql = vcv_parser(SchemaNames.tables).parsed_data
        vcv_index_sql = vcv_parser(SchemaNames.indices).parsed_data
        generator.create_schema(
            db_path=vcv_db,
            db_config=vcv_table_sql,
            db_indices=vcv_index_sql,
        )
    logger.info("Schema generation completed successfully")
    # collect (label, path) for every schema that was requested
    created_schemas = []
    if args.rcv:
        created_schemas.append(('RCV', rcv_db))
    if args.vcv:
        created_schemas.append(('VCV', vcv_db))
    # user-facing summary
    rule = "=" * 70
    print("\n" + rule)
    print("SCHEMA GENERATION SUMMARY")
    print(rule)
    print(f"Output Directory: {args.directory}")
    print(f"\nCreated Schemas ({len(created_schemas)}):")
    for schema_name, schema_path in created_schemas:
        print(f" ✓ {schema_name:12s} → {schema_path}")
    print(rule)
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Script entry point: run the CLI and translate the outcome into an exit
# status (0 on success, 1 on interruption or error).
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # diagnostics go to stderr so they do not mix with the summary
        # that main() prints on stdout
        print("\nInterrupted by user", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # keep the short message for the user, but retain the traceback
        # for anyone running with debug logging enabled
        logger.debug("Schema generation failed", exc_info=True)
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        sys.exit(0)