1. Soft-Scoring über Textähnlichkeit

• Neue Funktionen build_lexicons(...) und compute_soft_system_scores(...) (TF-IDF + Cosine).
• Erzeugt Psych_Score und Sozial_Score im Bereich 0..1.
• Quelle der Lexika: zuerst system_mapping.csv (falls vorhanden), dann bereits klassifizierte Stichwörter, sonst konservative Heuristik.

2. Neuer Kopplungsindex

• compute_coupling_index(...) nutzt jetzt Soft-Scores.
• Kopplungsgewicht = harmonisches Mittel H(p,s) = 2ps/(p+s).
• Index = sign(d) * norm(|d|) * H.
• Adressierungslabel basieren auf den Soft-Scores (Schwelle 0.5).

3. Plots umgestellt

• 2D-Plot nutzt Psych_Score (x) und Sozial_Score (y), Achsen 0..1 mit Ticks 0, .25, .5, .75, 1.
• 3D-Plot nutzt Soft-Scores auf x/y, d auf z.
• Farben/Marker weiter strikt aus dem CI-Template.

4. Pipeline

• Nach classify_systems(...) wird compute_soft_system_scores(...) aufgerufen.
• Export erweitert um Psych_Score und Sozial_Score.
• Kleine Kurzdiagnostik: Quartile der Soft-Scores im Terminal.
2025-09-03 21:43:54 +02:00
parent 55812bc5d6
commit 97c15bf209
1 changed files with 141 additions and 33 deletions
--- a/systemtheoretisch.py
+++ b/systemtheoretisch.py
@ -192,6 +192,85 @@ def classify_systems(df: pd.DataFrame, mapping: pd.DataFrame | None = None) -> p
    return out
 # -----------------------------------------
 # Soft Scores via Textähnlichkeit (TF-IDF + Cosine)
 # -----------------------------------------
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 def _normalize_01(a: np.ndarray) -> np.ndarray:
    """
    Min-max scale ``a`` onto the interval [0, 1].

    The input is converted to a float array. An empty input is returned as
    that (empty) float array. If the NaN-ignoring minimum or maximum is not
    finite, or the value range is at most 1e-12, an all-zero array of the same
    shape is returned instead of dividing by a (near-)zero range.
    """
    values = np.asarray(a, dtype=float)
    if values.size == 0:
        return values
    # nanmin/nanmax skip NaN entries; an all-NaN input yields NaN bounds and
    # therefore ends up in the all-zero fallback below.
    lower, upper = np.nanmin(values), np.nanmax(values)
    if np.isfinite(lower) and np.isfinite(upper):
        span = upper - lower
        if span > 1e-12:
            return (values - lower) / span
    return np.zeros_like(values, dtype=float)
 def build_lexicons(df: pd.DataFrame, mapping: pd.DataFrame | None) -> tuple[str, str]:
    """
    Erzeugt zwei 'Pseudodokumente' (Lexika) für psychische vs. soziale Marker.
    Präferenz: Mapping → bereits klassifizierte Stichwörter → konservative Heuristik.
    """
    # 1) Aus Mapping (explizite Terme)
    psych_terms, sozial_terms = [], []
    if mapping is not None and not mapping.empty:
        psych_terms = mapping.loc[mapping["Psych"] == 1, "Term"].astype(str).tolist()
        sozial_terms = mapping.loc[mapping["Sozial"] == 1, "Term"].astype(str).tolist()
    # 2) Ergänzen durch bereits klassifizierte Stichwörter
    if "Psych" in df.columns and "Sozial" in df.columns:
        psych_terms += df.loc[df["Psych"] == 1, "Stichwort"].astype(str).tolist()
        sozial_terms += df.loc[df["Sozial"] == 1, "Stichwort"].astype(str).tolist()
    # 3) Fallback-Heuristik
    if not psych_terms:
        psych_terms = [
            "Intelligenz","Kognition","Exekutive Funktionen","Gedächtnis","Selbstwirksamkeit",
            "Selbstbild","Emotion","Motivation","Ausdauer","Beharrlichkeit","Zuversicht",
            "Mindset","Kreativität","Neugier","Arbeitsgedächtnis","Einstellung","Wille"
        ]
    if not sozial_terms:
        sozial_terms = [
            "Klasse","Beziehung","Lehrer","Schüler","Unterricht","Klima","Team","Gruppe",
            "Beratung","Schulleitung","Schule","Familie","Eltern","Zusammenhalt",
            "Zugehörigkeit","Curriculum","Stundenplan","Pause","Konflikt","Sommerschule"
        ]
    # Als Pseudodokumente zusammenfassen
    doc_psych = " ".join(map(str, psych_terms))
    doc_sozial = " ".join(map(str, sozial_terms))
    return doc_psych, doc_sozial
 def compute_soft_system_scores(df: pd.DataFrame, mapping: pd.DataFrame | None) -> pd.DataFrame:
    """
    Add continuous ``Psych_Score`` / ``Sozial_Score`` columns to a copy of ``df``.

    Every ``Stichwort`` is vectorised with TF-IDF together with the two lexicon
    pseudo-documents from ``build_lexicons``. Its cosine similarity to each
    pseudo-document is then rescaled with ``_normalize_01`` and stored as the
    corresponding score column. The input frame itself is not modified.
    """
    scored = df.copy()

    # The two lexicon pseudo-documents are appended after all keywords, so
    # they occupy the last two rows of the TF-IDF matrix.
    lexicon_docs = build_lexicons(scored, mapping)
    keyword_docs = scored["Stichwort"].astype(str).tolist()
    vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2))
    tfidf = vectorizer.fit_transform(keyword_docs + list(lexicon_docs))

    n_keywords = tfidf.shape[0] - 2
    keyword_rows = tfidf[:n_keywords]

    targets = (
        ("Psych_Score", n_keywords),        # row of the psychic lexicon
        ("Sozial_Score", n_keywords + 1),   # row of the social lexicon
    )
    for column, lexicon_row in targets:
        # Similarity comes back as an (n_keywords, 1) matrix; flatten to 1D
        # before rescaling.
        similarity = cosine_similarity(keyword_rows, tfidf[lexicon_row])
        scored[column] = _normalize_01(similarity.ravel())
    return scored
 # -----------------------------------------
 # Kopplungsindex (Erziehungssystem: Lernfähig/nicht lernfähig)
 # -----------------------------------------
@ -206,38 +285,53 @@ def minmax_norm(a: np.ndarray) -> np.ndarray:
 def compute_coupling_index(df: pd.DataFrame) -> pd.DataFrame:
    """
-    Kopplungsindex = norm(|d|) * w
+    Kontinuierlicher Kopplungsindex:
-      w = 1.0 bei Kopplung (Psych=1 & Sozial=1)
+      - p = Psych_Score (0..1), s = Sozial_Score (0..1); Fallback auf binäre 'Psych'/'Sozial'
-          0.6 bei nur Psych=1 oder nur Sozial=1
+      - H = harmonisches Mittel = 2ps/(p+s) (0, wenn p+s=0)
-          0.2 sonst (unspezifisch)
+      - |d| min-max-normalisiert
-    Vorzeichen des Index = Vorzeichen(d)
+      - Index = sign(d) * norm(|d|) * H
    """
    out = df.copy()
    # Soft Scores oder Fallback
    if "Psych_Score" in out.columns and "Sozial_Score" in out.columns:
        p = out["Psych_Score"].astype(float).values
        s = out["Sozial_Score"].astype(float).values
    else:
        p = out.get("Psych", pd.Series(0, index=out.index)).astype(float).clip(0,1).values
        s = out.get("Sozial", pd.Series(0, index=out.index)).astype(float).clip(0,1).values
    # Harmonisches Mittel (numerisch stabil)
    denom = p + s
    H = np.zeros_like(denom, dtype=float)
    mask = denom > 0
    H[mask] = 2 * p[mask] * s[mask] / denom[mask]
    # |d| normalisieren
    abs_d = out["Effektstärke"].abs().values
-    abs_d_norm = minmax_norm(abs_d)
+    abs_d_norm = _normalize_01(abs_d)
-    both = (out["Psych"] == 1) & (out["Sozial"] == 1)
+    signed_index = np.sign(out["Effektstärke"].values) * abs_d_norm * H
-    single = ((out["Psych"] == 1) ^ (out["Sozial"] == 1))
+    out["Kopplungsindex"] = signed_index
    none = (out["Psych"] == 0) & (out["Sozial"] == 0)
-    w = np.where(both, 1.0, np.where(single, 0.6, 0.2))
+    # Adressierungslabel anhand Soft Scores
-    signed = np.sign(out["Effektstärke"].values) * abs_d_norm * w
+    def addr_lab(pp, ss):
-    out["Kopplungsindex"] = signed
+        if pp >= 0.5 and ss >= 0.5:
    # Label für schnelle Lesbarkeit
    def addr_label(p, s):
        if p == 1 and s == 1:
            return "Kopplung (Psych+Sozial)"
-        if p == 1 and s == 0:
+        if pp >= 0.5 and ss < 0.5:
            return "Psychisch adressiert"
-        if p == 0 and s == 1:
+        if pp < 0.5 and ss >= 0.5:
            return "Sozial adressiert"
        return "Unspezifisch"
-    out["Adressierung"] = [addr_label(p, s) for p, s in zip(out["Psych"], out["Sozial"])]
+
    # Für Labels Soft-Scores nutzen, falls vorhanden
    p_for_label = p
    s_for_label = s
    out["Adressierung"] = [addr_lab(pp, ss) for pp, ss in zip(p_for_label, s_for_label)]
    # Ränge
    out["Rank_abs_d"] = (-out["Effektstärke"].abs()).rank(method="min").astype(int)
-    out["Rank_kopplung"] = (-out["Kopplungsindex"].abs()).rank(method="min").astype(int)
+    out["Rank_kopplung"] = (-np.abs(out["Kopplungsindex"])).rank(method="min").astype(int)
    return out
 # -----------------------------------------
@ -247,8 +341,8 @@ def plot_sign_system_2d(df: pd.DataFrame):
    """
    2D-Sicht: X=Psych (0/1), Y=Sozial (0/1), Markergröße|Farbe ~ Kopplungsindex
    """
-    x = df["Psych"].astype(int)
+    x = (df["Psych_Score"] if "Psych_Score" in df.columns else df["Psych"].astype(float).clip(0,1))
-    y = df["Sozial"].astype(int)
+    y = (df["Sozial_Score"] if "Sozial_Score" in df.columns else df["Sozial"].astype(float).clip(0,1))
    size = (df["Kopplungsindex"].abs() * 22.0 + 6.0).astype(float)
    color_pos = _colors.get("positiveHighlight", "#2ca02c")
@ -281,19 +375,19 @@ def plot_sign_system_2d(df: pd.DataFrame):
        name="Thermometer"
    ))
-    # Diskrete Achsen (0/1) mit CI-Layout
+    # Kontinuierliche Achsen (0..1) mit CI-Layout
    fig.update_layout(_layout(
        "Erziehungssystem – Adressierung & Kopplung (2D)",
-        "Psychisch (0/1)", "Sozial (0/1)"
+        "Psychisch (0..1)", "Sozial (0..1)"
    ))
-    fig.update_xaxes(tickmode="array", tickvals=[0, 1], ticktext=["0", "1"])
+    fig.update_xaxes(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0])
-    fig.update_yaxes(tickmode="array", tickvals=[0, 1], ticktext=["0", "1"])
+    fig.update_yaxes(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0])
    fig.show()
    export_figure(fig, "sys_erziehung_2d", export_fig_visual, export_fig_png)
 def plot_sign_system_3d(df: pd.DataFrame):
    """
-    3D-Sicht: X=Psych (0/1), Y=Sozial (0/1), Z=Effektstärke; Farbe/Größe ~ Kopplungsindex
+    3D-Sicht: X=Psych_Score (0..1), Y=Sozial_Score (0..1), Z=Effektstärke; Farbe/Größe ~ Kopplungsindex
    """
    size = (df["Kopplungsindex"].abs() * 8.0 + 4.0).astype(float)
    color_pos = _colors.get("positiveHighlight", "#2ca02c")
@ -310,8 +404,8 @@ def plot_sign_system_3d(df: pd.DataFrame):
    fig = go.Figure()
    fig.add_trace(go.Scatter3d(
-        x=df["Psych"].astype(int),
+        x=(df["Psych_Score"] if "Psych_Score" in df.columns else df["Psych"].astype(float).clip(0,1)),
-        y=df["Sozial"].astype(int),
+        y=(df["Sozial_Score"] if "Sozial_Score" in df.columns else df["Sozial"].astype(float).clip(0,1)),
        z=df["Effektstärke"],
        mode="markers",
        marker={**_styles.get("marker_accent", {}), "size": size, "color": point_colors},
@ -323,11 +417,11 @@ def plot_sign_system_3d(df: pd.DataFrame):
    fig.update_layout(_layout(
        "Erziehungssystem – 3D-Sicht (Psych × Sozial × d)",
-        "Psychisch (0/1)", "Sozial (0/1)", "Cohen d"
+        "Psychisch (0..1)", "Sozial (0..1)", "Cohen d"
    ))
    fig.update_scenes(
-        xaxis=dict(tickmode="array", tickvals=[0,1], ticktext=["0","1"]),
+        xaxis=dict(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0]),
-        yaxis=dict(tickmode="array", tickvals=[0,1], ticktext=["0","1"])
+        yaxis=dict(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0])
    )
    fig.show()
    export_figure(fig, "sys_erziehung_3d", export_fig_visual, export_fig_png)
@ -373,17 +467,31 @@ def analyse_system(path_csv: str, map_csv: str = "system_mapping.csv"):
    mapping = load_system_mapping(map_csv)
    df = classify_systems(df, mapping=mapping)
    # Soft Scores aus Textähnlichkeit
    df = compute_soft_system_scores(df, mapping=mapping)
    # Kopplungsindex
    df = compute_coupling_index(df)
    # Export Kern-Output
    try:
-        out_cols = ["Thermometer_ID","Stichwort","Kapitel","Kapitelname","Effektstärke","Psych","Sozial","Adressierung","Kopplungsindex"]
+        out_cols = [
            "Thermometer_ID","Stichwort","Kapitel","Kapitelname","Effektstärke",
            "Psych","Sozial","Psych_Score","Sozial_Score",
            "Adressierung","Kopplungsindex"
        ]
        df[out_cols].to_csv(os.path.join(EXPORT_DIR, "system_view.csv"), index=False)
        export_json(df[out_cols].to_dict(orient="records"), "system_view.json")
    except Exception:
        pass
    # Kurzdiagnostik
    print("Soft-Score-Quartile (Psych, Sozial):")
    for col in ["Psych_Score","Sozial_Score"]:
        if col in df.columns:
            q = df[col].quantile([0.25,0.5,0.75]).round(3).to_dict()
            print(f"  {col}: q25={q.get(0.25)}, q50={q.get(0.5)}, q75={q.get(0.75)}")
    # Visualisierungen
    plot_sign_system_2d(df)
    plot_sign_system_3d(df)