# Genomic_data_QSVM/qsvm.py

import numpy as np
import pandas as pd
import os
import sys
import time
from datetime import datetime
import json
import gc
import signal
import multiprocessing
from multiprocessing import Pool
# Quantum computing libraries (legacy API: the Aer import, QuantumKernel and
# Initialize require pre-1.0 Qiskit and an older qiskit-machine-learning)
from qiskit import Aer
from qiskit.circuit.library import ZZFeatureMap, PauliFeatureMap, EfficientSU2
from qiskit_machine_learning.kernels import QuantumKernel
from qiskit_machine_learning.algorithms import QSVC
import dimod
# Recommended replacement for ZZFeatureMap
from qiskit.circuit import QuantumCircuit, Parameter
from qiskit.circuit.library import ZFeatureMap
# Libraries for amplitude encoding
from qiskit.extensions import Initialize
import scipy.linalg as la
# UMAP library
import umap
# Maximum experiment duration: 3 weeks, in seconds
MAX_EXECUTION_TIME = 24 * 60 * 60 * 7 * 3

def timeout_handler(signum, frame):
    print("\n\n======= MAXIMUM EXECUTION TIME EXCEEDED =======")
    print(f"The experiment was aborted after {MAX_EXECUTION_TIME/3600:.1f} hours.")
    sys.exit(1)

# Install the timeout handler (signal.SIGALRM is available on Unix only)
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(MAX_EXECUTION_TIME)
# ----------------- PART 0: CONFIGURATION PARAMETERS -----------------
# Data parameters
DATA_FILES = [
    'dane/TCGA_GBM_LGG_Mutations_all.csv',
    'dane/zaszumione_rozszerzone/TCGA_GBM_LGG_Mutations_noise_1percent_added.csv',
    'dane/zaszumione_rozszerzone/TCGA_GBM_LGG_Mutations_noise_5percent_added.csv',
    'dane/zaszumione_rozszerzone/TCGA_GBM_LGG_Mutations_noise_10percent_added.csv',
    'dane/zaszumione_rozszerzone/TCGA_GBM_LGG_Mutations_noise_15percent_added.csv',
    'dane/zaszumione_rozszerzone/TCGA_GBM_LGG_Mutations_noise_20percent_added.csv',
    'dane/zaszumione/TCGA_GBM_LGG_Mutations_noise_1percent_substituted.csv',
    'dane/zaszumione/TCGA_GBM_LGG_Mutations_noise_5percent_substituted.csv',
    'dane/zaszumione/TCGA_GBM_LGG_Mutations_noise_10percent_substituted.csv',
    'dane/zaszumione/TCGA_GBM_LGG_Mutations_noise_15percent_substituted.csv',
    'dane/zaszumione/TCGA_GBM_LGG_Mutations_noise_20percent_substituted.csv'
]
TEST_SIZE = 0.3
RANDOM_STATE = 42
# Dimensionality reduction parameters
USE_PCA = True
USE_TSNE = False
USE_UMAP = False
PCA_COMPONENTS = 14
TSNE_COMPONENTS = 3
TSNE_PERPLEXITY = 100
TSNE_LEARNING_RATE = 50
TSNE_MAX_ITER = 1000
UMAP_COMPONENTS = 14
UMAP_NEIGHBORS = 15
UMAP_MIN_DIST = 0.8
UMAP_METRIC = 'euclidean'
EVALUATE_SILHOUETTE = False
OPTIMAL_SILHOUETTE_SCORE = 0.1964
# Which experiments to run
RUN_CLASSIC_SVM = True
RUN_QUANTUM_SVM = True
RUN_HYBRID_APPROACH = True
# Classical SVM parameters
SVM_PARAM_GRID = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.1, 0.01],
    'kernel': ['linear', 'rbf', 'poly']
}
SVM_CV = 5
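
# --- Illustrative sketch (not part of the original pipeline): the classical
# SVM experiments presumably consume SVM_PARAM_GRID and SVM_CV via a grid
# search along these lines; the real training code lives in the per-experiment
# scripts. All calls below are standard scikit-learn API.
def _example_classic_svm_grid_search(X_train, y_train):
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC
    # Exhaustive search over the configured grid with SVM_CV-fold CV
    grid = GridSearchCV(SVC(), SVM_PARAM_GRID, cv=SVM_CV, n_jobs=-1)
    grid.fit(X_train, y_train)
    return grid.best_estimator_, grid.best_params_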
# Quantum SVM parameters
BACKEND_NAME = 'qasm_simulator'
C_VALUES = [0.1, 1.0, 10.0]
QSVM_CV = 10
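
# --- Illustrative sketch (not part of the original pipeline): how the imported
# ZZFeatureMap / QuantumKernel / QSVC are typically wired together with the
# parameters above under the legacy qiskit-machine-learning API. The actual
# quantum experiments live in qsvm1_zz.py etc.; `n_features` is a placeholder.
def _example_quantum_kernel_qsvc(X_train, y_train, n_features):
    # Encode classical features into a parameterised quantum circuit
    feature_map = ZZFeatureMap(feature_dimension=n_features, reps=2)
    backend = Aer.get_backend(BACKEND_NAME)
    # Kernel entries are estimated by executing circuits on the backend
    kernel = QuantumKernel(feature_map=feature_map, quantum_instance=backend)
    qsvc = QSVC(quantum_kernel=kernel, C=C_VALUES[0])
    qsvc.fit(X_train, y_train)
    return qsvc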
# Quantum annealing parameters
NUM_READS = 100
QUBO_PENALTY = 10.0
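
# --- Illustrative sketch (not part of the original pipeline): NUM_READS and
# QUBO_PENALTY suggest an annealing-style QUBO step (dimod is imported above).
# This toy example only demonstrates the dimod API shape with the reference
# simulated-annealing sampler; the real QUBO construction is elsewhere.
def _example_qubo_sampling(num_vars=4):
    # Toy QUBO: reward selecting each variable, penalise selecting pairs
    Q = {(i, i): -1.0 for i in range(num_vars)}
    for i in range(num_vars):
        for j in range(i + 1, num_vars):
            Q[(i, j)] = QUBO_PENALTY
    bqm = dimod.BinaryQuadraticModel.from_qubo(Q)
    sampleset = dimod.SimulatedAnnealingSampler().sample(bqm, num_reads=NUM_READS)
    # Lowest-energy assignment found across NUM_READS reads
    return sampleset.first.sample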
# Feature analysis parameters
IMPORTANCE_THRESHOLD = 0.01
# Output parameters
OUTPUT_DIR = f'wyniki/2025-08-04-dim_reduction-{QSVM_CV}-fold'
# IBM Quantum Cloud parameters
USE_IBM_QUANTUM = False
IBM_BACKEND = 'qasm_simulator'
IBM_REAL_BACKEND = 'qasm_simulator'
IBM_TOKEN = None
IBM_INSTANCE = None
IBM_MAX_SHOTS = 1024
IBM_OPTIMIZATION_LEVEL = 1
IBM_RESILIENCE_LEVEL = 1
# Make sure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Mirror stdout to both the console and a log file
class Logger:
    def __init__(self, filename):
        self.terminal = sys.stdout
        self.log = open(filename, 'w', encoding='utf-8')

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
        self.log.flush()

    def flush(self):
        self.terminal.flush()
        self.log.flush()

    def close(self):
        self.log.close()
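
# Illustrative usage (hypothetical log-file name; the experiment scripts pick
# their own): redirect stdout so prints reach both the console and a file.
#
#   sys.stdout = Logger(os.path.join(OUTPUT_DIR, 'experiment_log.txt'))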
# Compute and print detailed classification metrics
def save_metrics(y_true, y_pred, model_name):
    from sklearn.metrics import (confusion_matrix, roc_auc_score, accuracy_score,
                                 precision_score, recall_score, f1_score)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    try:
        roc_auc = roc_auc_score(y_true, y_pred)
    except Exception:
        # roc_auc_score fails e.g. for non-numeric or multiclass labels
        roc_auc = "N/A"
    print(f"\nDetailed metrics for model {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc}")
    cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion matrix:")
    print(cm)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'confusion_matrix': cm.tolist()
    }
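
# Illustrative usage (hypothetical variable and model names):
#
#   y_pred = model.predict(X_test_reduced)
#   metrics = save_metrics(y_test, y_pred, 'QSVC (ZZFeatureMap)')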
# Save intermediate results
def save_results_cache(results_dict, cache_file):
    with open(cache_file, 'w') as f:
        json.dump(results_dict, f)

# Load intermediate results; fall back to an empty structure on failure
def load_results_cache(cache_file):
    if os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                return json.load(f)
        except Exception:
            print("Failed to load the cache file. Creating a new one.")
    return {
        'quantum_results': [],
        'quantum_times': {},
        'completed_feature_maps': [],
        'hybrid_scores': {},
        'hybrid_eval_times': {}
    }
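
# Illustrative round trip (hypothetical cache file name; the experiment
# scripts choose their own cache paths):
#
#   cache_path = os.path.join(OUTPUT_DIR, 'results_cache.json')
#   cache = load_results_cache(cache_path)
#   cache['completed_feature_maps'].append('zz')
#   save_results_cache(cache, cache_path)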
# Initialize the local simulator
def initialize_ibm_quantum():
    print("\n======= INITIALIZING LOCAL SIMULATOR =======")
    print("IBM Quantum Cloud disabled - using the local simulator")
    try:
        backend = Aer.get_backend('qasm_simulator')
        print("✓ Initialized the local Qiskit Aer simulator")
        print(f"✓ Backend: {backend.name()}")
        return None, backend, True
    except Exception as e:
        print(f"ERROR while initializing the local simulator: {str(e)}")
        return None, None, False
# Data preparation: load a CSV file, encode features, reduce dimensionality,
# and split into train/test sets
def prepare_data(data_file):
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE
    from sklearn.metrics import silhouette_score

    start_time_data = time.time()
    # Load the data
    data = pd.read_csv(data_file)
    # Separate the target variable
    y = data['Primary_Diagnosis']
    # Drop identifier columns, the 'Grade' column, and the target itself
    # (keeping 'Primary_Diagnosis' among the features would leak the label)
    id_columns = ['Project', 'Case_ID']
    data_processed = data.drop(id_columns + ['Grade', 'Primary_Diagnosis'], axis=1)
    # Convert categorical columns to binary/numeric encodings
    categorical_columns = data_processed.select_dtypes(include=['object']).columns.tolist()
    # Encode each categorical column
    for col in categorical_columns:
        unique_values = data_processed[col].unique()
        if len(unique_values) == 2:
            if set(unique_values) == {'Yes', 'No'}:
                data_processed[col] = data_processed[col].map({'Yes': 1, 'No': 0})
            elif set(unique_values) == {'Male', 'Female'}:
                data_processed[col] = data_processed[col].map({'Male': 1, 'Female': 0})
            elif set(unique_values) == {'GBM', 'LGG'}:
                data_processed[col] = data_processed[col].map({'GBM': 1, 'LGG': 0})
            else:
                data_processed[col] = pd.factorize(data_processed[col])[0]
        else:
            try:
                data_processed[col] = pd.to_numeric(data_processed[col], errors='raise')
            except (ValueError, TypeError):
                # Fall back to one-hot encoding for multi-valued text columns
                dummies = pd.get_dummies(data_processed[col], prefix=col, drop_first=True)
                data_processed = pd.concat([data_processed, dummies], axis=1)
                data_processed.drop(col, axis=1, inplace=True)
    # Make sure every remaining column is numeric
    non_numeric = data_processed.select_dtypes(include=['object']).columns.tolist()
    if non_numeric:
        data_processed = data_processed.drop(non_numeric, axis=1)
    # Fill in missing values with column means
    if data_processed.isnull().sum().sum() > 0:
        data_processed.fillna(data_processed.mean(), inplace=True)
    # Prepare the feature matrix for modelling
    X = data_processed.values
    # Standardise the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Dimensionality reduction: build each configured reduced version of the data
    X_reduced_versions = {}
    if USE_PCA:
        pca_components = min(PCA_COMPONENTS, X_scaled.shape[1])
        pca = PCA(n_components=pca_components)
        X_reduced_pca = pca.fit_transform(X_scaled)
        X_reduced_versions['pca'] = X_reduced_pca
    if USE_TSNE:
        tsne = TSNE(
            n_components=TSNE_COMPONENTS,
            perplexity=TSNE_PERPLEXITY,
            learning_rate=TSNE_LEARNING_RATE,
            n_iter=TSNE_MAX_ITER,
            random_state=RANDOM_STATE
        )
        tsne_start_time = time.time()
        X_reduced_tsne = tsne.fit_transform(X_scaled)
        tsne_time = time.time() - tsne_start_time
        if EVALUATE_SILHOUETTE:
            try:
                silhouette_avg = silhouette_score(X_reduced_tsne, y)
            except Exception:
                pass
        X_reduced_versions['tsne'] = X_reduced_tsne
    if USE_UMAP:
        umap_reducer = umap.UMAP(
            n_components=UMAP_COMPONENTS,
            n_neighbors=UMAP_NEIGHBORS,
            min_dist=UMAP_MIN_DIST,
            metric=UMAP_METRIC,
            random_state=RANDOM_STATE
        )
        umap_start_time = time.time()
        X_reduced_umap = umap_reducer.fit_transform(X_scaled)
        umap_time = time.time() - umap_start_time
        if EVALUATE_SILHOUETTE:
            try:
                silhouette_avg_umap = silhouette_score(X_reduced_umap, y)
            except Exception:
                pass
        X_reduced_versions['umap'] = X_reduced_umap
    # Pick the default reduced representation (preference: PCA, then UMAP, then t-SNE)
    if 'pca' in X_reduced_versions:
        X_reduced = X_reduced_versions['pca']
    elif 'umap' in X_reduced_versions:
        X_reduced = X_reduced_versions['umap']
    elif 'tsne' in X_reduced_versions:
        X_reduced = X_reduced_versions['tsne']
    else:
        X_reduced = X_scaled
    # Train/test split on the reduced data
    X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(
        X_reduced, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )
    # Matching split on the original (unreduced) features
    X_train, X_test, _, _ = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
    data_preparation_time = time.time() - start_time_data
    return {
        'X_train': X_train,
        'X_test': X_test,
        'X_train_reduced': X_train_reduced,
        'X_test_reduced': X_test_reduced,
        'y_train': y_train,
        'y_test': y_test,
        'data_processed': data_processed,
        'preparation_time': data_preparation_time
    }
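
# Illustrative usage (assumed; each experiment script calls prepare_data on
# the file it is assigned): prepare the clean baseline dataset and unpack
# the dimensionality-reduced splits.
#
#   splits = prepare_data(DATA_FILES[0])
#   X_tr, X_te = splits['X_train_reduced'], splits['X_test_reduced']
#   y_tr, y_te = splits['y_train'], splits['y_test']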
def run_experiment_parallel(exp_file):
    """Run a single experiment."""
    if exp_file == 'qsvm1_zz.py':
        import qsvm1_zz
        return qsvm1_zz.run_experiment()
    elif exp_file == 'qsvm2_pauli.py':
        import qsvm2_pauli
        return qsvm2_pauli.run_experiment()
    elif exp_file == 'qsvm3_z.py':
        import qsvm3_z
        return qsvm3_z.run_experiment()
    elif exp_file == 'qsvm4_amplitude.py':
        import qsvm4_amplitude
        return qsvm4_amplitude.run_experiment()
    elif exp_file == 'qsvm5_hybrid.py':
        import qsvm5_hybrid
        return qsvm5_hybrid.run_experiment()
    else:
        print(f"Unknown experiment: {exp_file}")
        return None
def run_all_experiments_parallel():
    """Run all experiments in parallel."""
    experiment_files = [
        'qsvm1_zz.py',
        'qsvm2_pauli.py',
        'qsvm3_z.py',
        'qsvm4_amplitude.py',
        'qsvm5_hybrid.py'
    ]
    # Use 5 worker processes (one per experiment)
    with Pool(5) as p:
        results = p.map(run_experiment_parallel, experiment_files)
    return results
if __name__ == "__main__":
    print("======= QSVM EXPERIMENT CONTROLLER =======")
    print(f"CPU cores: {multiprocessing.cpu_count()}")
    print("Running 5 experiments in parallel...")
    results = run_all_experiments_parallel()
    print("======= ALL EXPERIMENTS FINISHED =======")