From 97c15bf209a83edac5282801e4d6c5a59e655c50 Mon Sep 17 00:00:00 2001
From: Jochen Hanisch-Johannsen <kontakt@jochen-hanisch.de>
Date: Wed, 3 Sep 2025 21:43:54 +0200
Subject: [PATCH] =?UTF-8?q?1.=09Soft-Scoring=20=C3=BCber=20Text=C3=A4hnlic?=
 =?UTF-8?q?hkeit=20=09=E2=80=A2=09Neue=20Funktionen=20build=5Flexicons(...?=
 =?UTF-8?q?)=20und=20compute=5Fsoft=5Fsystem=5Fscores(...)=20(TF-IDF=20+?=
 =?UTF-8?q?=20Cosine).=20=09=E2=80=A2=09Erzeugt=20Psych=5FScore=20und=20So?=
 =?UTF-8?q?zial=5FScore=20im=20Bereich=200..1.=20=09=E2=80=A2=09Quelle=20d?=
 =?UTF-8?q?er=20Lexika:=20zuerst=20system=5Fmapping.csv=20(falls=20vorhand?=
 =?UTF-8?q?en),=20dann=20bereits=20klassifizierte=20Stichw=C3=B6rter,=20so?=
 =?UTF-8?q?nst=20konservative=20Heuristik.=20=092.=09Neuer=20Kopplungsinde?=
 =?UTF-8?q?x=20=09=E2=80=A2=09compute=5Fcoupling=5Findex(...)=20nutzt=20je?=
 =?UTF-8?q?tzt=20Soft-Scores.=20=09=E2=80=A2=09Kopplungsgewicht=20=3D=20ha?=
 =?UTF-8?q?rmonisches=20Mittel=20H(p,s)=20=3D=202ps/(p+s).=20=09=E2=80=A2?=
 =?UTF-8?q?=09Index=20=3D=20sign(d)=20*=20norm(|d|)=20*=20H.=20=09?=
 =?UTF-8?q?=E2=80=A2=09Adressierungslabel=20basieren=20auf=20den=20Soft-Sc?=
 =?UTF-8?q?ores=20(Schwelle=200.5).=20=093.=09Plots=20umgestellt=20=09?=
 =?UTF-8?q?=E2=80=A2=092D-Plot=20nutzt=20Psych=5FScore=20(x)=20und=20Sozia?=
 =?UTF-8?q?l=5FScore=20(y),=20Achsen=200..1=20mit=20Ticks=200,=20.25,=20.5?=
 =?UTF-8?q?,=20.75,=201.=20=09=E2=80=A2=093D-Plot=20nutzt=20Soft-Scores=20?=
 =?UTF-8?q?auf=20x/y,=20d=20auf=20z.=20=09=E2=80=A2=09Farben/Marker=20weit?=
 =?UTF-8?q?er=20strikt=20aus=20dem=20CI-Template.=20=094.=09Pipeline=20=09?=
 =?UTF-8?q?=E2=80=A2=09Nach=20classify=5Fsystems(...)=20wird=20compute=5Fs?=
 =?UTF-8?q?oft=5Fsystem=5Fscores(...)=20aufgerufen.=20=09=E2=80=A2=09Expor?=
 =?UTF-8?q?t=20erweitert=20um=20Psych=5FScore=20und=20Sozial=5FScore.=20?=
 =?UTF-8?q?=09=E2=80=A2=09Kleine=20Kurzdiagnostik:=20Quartile=20der=20Soft?=
 =?UTF-8?q?-Scores=20im=20Terminal.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 visible-learning systemtheoretisch.py | 174 +++++++++++++++++++++-----
 1 file changed, 141 insertions(+), 33 deletions(-)

diff --git a/visible-learning systemtheoretisch.py b/visible-learning systemtheoretisch.py
index 231ff22..232e7f1 100644
--- a/visible-learning systemtheoretisch.py	
+++ b/visible-learning systemtheoretisch.py	
@@ -192,6 +192,85 @@ def classify_systems(df: pd.DataFrame, mapping: pd.DataFrame | None = None) -> p
 
     return out
 
+# -----------------------------------------
+# Soft Scores via Textähnlichkeit (TF-IDF + Cosine)
+# -----------------------------------------
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+def _normalize_01(a: np.ndarray) -> np.ndarray:
+    """Min-max scale *a* to [0, 1], ignoring NaN; a non-finite or (near-)zero range yields zeros."""
+    values = np.asarray(a, dtype=float)
+    if values.size == 0:
+        return values
+    lower, upper = np.nanmin(values), np.nanmax(values)
+    usable = np.isfinite(lower) and np.isfinite(upper) and upper - lower > 1e-12
+    return (values - lower) / (upper - lower) if usable else np.zeros_like(values, dtype=float)
+
+def build_lexicons(df: pd.DataFrame, mapping: pd.DataFrame | None) -> tuple[str, str]:
+    """Build two pseudo-documents (lexicons) of psychic vs. social marker terms.
+
+    Terms are the union of ``mapping`` rows flagged ``Psych``/``Sozial`` == 1 (column
+    ``Term``) and ``Stichwort`` values of ``df`` rows already flagged that way; the
+    built-in heuristic word list is used only when a side ends up empty. Missing
+    (NaN/None) terms are skipped so they cannot inject a literal "nan"/"None" token.
+
+    Returns ``(doc_psych, doc_sozial)``, each a single space-joined string.
+    """
+    def _flagged(frame: pd.DataFrame, flag: str, text_col: str) -> list[str]:
+        # Text of rows whose flag equals 1; missing text is dropped before stringifying.
+        return frame.loc[frame[flag] == 1, text_col].dropna().astype(str).tolist()
+
+    psych_terms, sozial_terms = [], []
+    if mapping is not None and not mapping.empty:
+        psych_terms += _flagged(mapping, "Psych", "Term")
+        sozial_terms += _flagged(mapping, "Sozial", "Term")
+    if "Psych" in df.columns and "Sozial" in df.columns:
+        psych_terms += _flagged(df, "Psych", "Stichwort")
+        sozial_terms += _flagged(df, "Sozial", "Stichwort")
+    if not psych_terms:  # fallback heuristic, only when nothing else was found
+        psych_terms = [
+            "Intelligenz","Kognition","Exekutive Funktionen","Gedächtnis","Selbstwirksamkeit",
+            "Selbstbild","Emotion","Motivation","Ausdauer","Beharrlichkeit","Zuversicht",
+            "Mindset","Kreativität","Neugier","Arbeitsgedächtnis","Einstellung","Wille"
+        ]
+    if not sozial_terms:
+        sozial_terms = [
+            "Klasse","Beziehung","Lehrer","Schüler","Unterricht","Klima","Team","Gruppe",
+            "Beratung","Schulleitung","Schule","Familie","Eltern","Zusammenhalt",
+            "Zugehörigkeit","Curriculum","Stundenplan","Pause","Konflikt","Sommerschule"
+        ]
+    return " ".join(psych_terms), " ".join(sozial_terms)
+
+def compute_soft_system_scores(df: pd.DataFrame, mapping: pd.DataFrame | None) -> pd.DataFrame:
+    """Add continuous ``Psych_Score``/``Sozial_Score`` columns (0..1) to a copy of ``df``.
+
+    Each ``Stichwort`` is compared (TF-IDF + cosine similarity) with the two lexicon
+    pseudo-documents from ``build_lexicons``; each similarity column is then min-max
+    scaled independently via ``_normalize_01``, so scores are relative to this dataset.
+    Missing keywords are treated as empty text; an empty frame gets empty score columns.
+    """
+    out = df.copy()
+    if len(out) == 0:
+        # cosine_similarity rejects a 0-row matrix, so short-circuit instead of crashing.
+        out["Psych_Score"] = np.zeros(0, dtype=float)
+        out["Sozial_Score"] = np.zeros(0, dtype=float)
+        return out
+
+    doc_psych, doc_sozial = build_lexicons(out, mapping)
+    # Empty text instead of the literal token "nan" for missing keywords.
+    keywords = out["Stichwort"].fillna("").astype(str).tolist()
+
+    # Fit on keywords plus both lexicons so all documents share one vocabulary.
+    vect = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
+    X = vect.fit_transform(keywords + [doc_psych, doc_sozial])
+
+    # One call: column 0 = similarity to the psych lexicon, column 1 = to the social one.
+    sims = cosine_similarity(X[:-2], X[-2:])
+    out["Psych_Score"] = _normalize_01(sims[:, 0])
+    out["Sozial_Score"] = _normalize_01(sims[:, 1])
+    return out
+
 # -----------------------------------------
 # Kopplungsindex (Erziehungssystem: Lernfähig/nicht lernfähig)
 # -----------------------------------------
@@ -206,38 +285,53 @@ def minmax_norm(a: np.ndarray) -> np.ndarray:
 
 def compute_coupling_index(df: pd.DataFrame) -> pd.DataFrame:
     """
-    Kopplungsindex = norm(|d|) * w
-      w = 1.0 bei Kopplung (Psych=1 & Sozial=1)
-          0.6 bei nur Psych=1 oder nur Sozial=1
-          0.2 sonst (unspezifisch)
-    Vorzeichen des Index = Vorzeichen(d)
+    Continuous coupling index:
+      - p = Psych_Score (0..1), s = Sozial_Score (0..1); falls back to binary 'Psych'/'Sozial'
+      - H = harmonic mean = 2ps/(p+s) (0 when p+s=0)
+      - |d| is min-max normalized
+      - Index = sign(d) * norm(|d|) * H
     """
     out = df.copy()
+
+    # Soft scores if both columns exist, otherwise the binary flags clipped to [0, 1]
+    if "Psych_Score" in out.columns and "Sozial_Score" in out.columns:
+        p = out["Psych_Score"].astype(float).values
+        s = out["Sozial_Score"].astype(float).values
+    else:
+        p = out.get("Psych", pd.Series(0, index=out.index)).astype(float).clip(0,1).values
+        s = out.get("Sozial", pd.Series(0, index=out.index)).astype(float).clip(0,1).values
+
+    # Harmonic mean; rows where p+s is not > 0 keep H = 0 (no division by zero)
+    denom = p + s
+    H = np.zeros_like(denom, dtype=float)
+    mask = denom > 0
+    H[mask] = 2 * p[mask] * s[mask] / denom[mask]
+
+    # Normalize |d| to 0..1
     abs_d = out["Effektstärke"].abs().values
-    abs_d_norm = minmax_norm(abs_d)
+    abs_d_norm = _normalize_01(abs_d)
 
-    both = (out["Psych"] == 1) & (out["Sozial"] == 1)
-    single = ((out["Psych"] == 1) ^ (out["Sozial"] == 1))
-    none = (out["Psych"] == 0) & (out["Sozial"] == 0)
+    signed_index = np.sign(out["Effektstärke"].values) * abs_d_norm * H
+    out["Kopplungsindex"] = signed_index
 
-    w = np.where(both, 1.0, np.where(single, 0.6, 0.2))
-    signed = np.sign(out["Effektstärke"].values) * abs_d_norm * w
-    out["Kopplungsindex"] = signed
-
-    # Label für schnelle Lesbarkeit
-    def addr_label(p, s):
-        if p == 1 and s == 1:
+    # Addressing label derived from the scores (threshold 0.5)
+    def addr_lab(pp, ss):
+        if pp >= 0.5 and ss >= 0.5:
             return "Kopplung (Psych+Sozial)"
-        if p == 1 and s == 0:
+        if pp >= 0.5 and ss < 0.5:
             return "Psychisch adressiert"
-        if p == 0 and s == 1:
+        if pp < 0.5 and ss >= 0.5:
             return "Sozial adressiert"
         return "Unspezifisch"
-    out["Adressierung"] = [addr_label(p, s) for p, s in zip(out["Psych"], out["Sozial"])]
+
+    # Labels use the same p/s as the index (soft scores if present, else binary flags)
+    p_for_label = p
+    s_for_label = s
+    out["Adressierung"] = [addr_lab(pp, ss) for pp, ss in zip(p_for_label, s_for_label)]
 
     # Ränge
     out["Rank_abs_d"] = (-out["Effektstärke"].abs()).rank(method="min").astype(int)
-    out["Rank_kopplung"] = (-out["Kopplungsindex"].abs()).rank(method="min").astype(int)
+    out["Rank_kopplung"] = (-np.abs(out["Kopplungsindex"])).rank(method="min").astype(int)
     return out
 
 # -----------------------------------------
@@ -247,8 +341,8 @@ def plot_sign_system_2d(df: pd.DataFrame):
     """
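-    2D-Sicht: X=Psych (0/1), Y=Sozial (0/1), Markergröße|Farbe ~ Kopplungsindex
+    2D view: X=Psych_Score (0..1), Y=Sozial_Score (0..1) (binary Psych/Sozial as fallback); marker size|color ~ Kopplungsindex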
     """
-    x = df["Psych"].astype(int)
-    y = df["Sozial"].astype(int)
+    x = (df["Psych_Score"] if "Psych_Score" in df.columns else df["Psych"].astype(float).clip(0,1))
+    y = (df["Sozial_Score"] if "Sozial_Score" in df.columns else df["Sozial"].astype(float).clip(0,1))
     size = (df["Kopplungsindex"].abs() * 22.0 + 6.0).astype(float)
 
     color_pos = _colors.get("positiveHighlight", "#2ca02c")
@@ -281,19 +375,19 @@ def plot_sign_system_2d(df: pd.DataFrame):
         name="Thermometer"
     ))
 
-    # Diskrete Achsen (0/1) mit CI-Layout
+    # Kontinuierliche Achsen (0..1) mit CI-Layout
     fig.update_layout(_layout(
         "Erziehungssystem – Adressierung & Kopplung (2D)",
-        "Psychisch (0/1)", "Sozial (0/1)"
+        "Psychisch (0..1)", "Sozial (0..1)"
     ))
-    fig.update_xaxes(tickmode="array", tickvals=[0, 1], ticktext=["0", "1"])
-    fig.update_yaxes(tickmode="array", tickvals=[0, 1], ticktext=["0", "1"])
+    fig.update_xaxes(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0])
+    fig.update_yaxes(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0])
     fig.show()
     export_figure(fig, "sys_erziehung_2d", export_fig_visual, export_fig_png)
 
 def plot_sign_system_3d(df: pd.DataFrame):
     """
-    3D-Sicht: X=Psych (0/1), Y=Sozial (0/1), Z=Effektstärke; Farbe/Größe ~ Kopplungsindex
+    3D-Sicht: X=Psych_Score (0..1), Y=Sozial_Score (0..1), Z=Effektstärke; Farbe/Größe ~ Kopplungsindex
     """
     size = (df["Kopplungsindex"].abs() * 8.0 + 4.0).astype(float)
     color_pos = _colors.get("positiveHighlight", "#2ca02c")
@@ -310,8 +404,8 @@ def plot_sign_system_3d(df: pd.DataFrame):
 
     fig = go.Figure()
     fig.add_trace(go.Scatter3d(
-        x=df["Psych"].astype(int),
-        y=df["Sozial"].astype(int),
+        x=(df["Psych_Score"] if "Psych_Score" in df.columns else df["Psych"].astype(float).clip(0,1)),
+        y=(df["Sozial_Score"] if "Sozial_Score" in df.columns else df["Sozial"].astype(float).clip(0,1)),
         z=df["Effektstärke"],
         mode="markers",
         marker={**_styles.get("marker_accent", {}), "size": size, "color": point_colors},
@@ -323,11 +417,11 @@ def plot_sign_system_3d(df: pd.DataFrame):
 
     fig.update_layout(_layout(
         "Erziehungssystem – 3D-Sicht (Psych × Sozial × d)",
-        "Psychisch (0/1)", "Sozial (0/1)", "Cohen d"
+        "Psychisch (0..1)", "Sozial (0..1)", "Cohen d"
     ))
     fig.update_scenes(
-        xaxis=dict(tickmode="array", tickvals=[0,1], ticktext=["0","1"]),
-        yaxis=dict(tickmode="array", tickvals=[0,1], ticktext=["0","1"])
+        xaxis=dict(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0]),
+        yaxis=dict(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0])
     )
     fig.show()
     export_figure(fig, "sys_erziehung_3d", export_fig_visual, export_fig_png)
@@ -373,17 +467,31 @@ def analyse_system(path_csv: str, map_csv: str = "system_mapping.csv"):
     mapping = load_system_mapping(map_csv)
     df = classify_systems(df, mapping=mapping)
 
+    # Soft Scores aus Textähnlichkeit
+    df = compute_soft_system_scores(df, mapping=mapping)
+
     # Kopplungsindex
     df = compute_coupling_index(df)
 
     # Export Kern-Output
     try:
-        out_cols = ["Thermometer_ID","Stichwort","Kapitel","Kapitelname","Effektstärke","Psych","Sozial","Adressierung","Kopplungsindex"]
+        out_cols = [
+            "Thermometer_ID","Stichwort","Kapitel","Kapitelname","Effektstärke",
+            "Psych","Sozial","Psych_Score","Sozial_Score",
+            "Adressierung","Kopplungsindex"
+        ]
         df[out_cols].to_csv(os.path.join(EXPORT_DIR, "system_view.csv"), index=False)
         export_json(df[out_cols].to_dict(orient="records"), "system_view.json")
     except Exception:
         pass
 
+    # Kurzdiagnostik
+    print("Soft-Score-Quartile (Psych, Sozial):")
+    for col in ["Psych_Score","Sozial_Score"]:
+        if col in df.columns:
+            q = df[col].quantile([0.25,0.5,0.75]).round(3).to_dict()
+            print(f"  {col}: q25={q.get(0.25)}, q50={q.get(0.5)}, q75={q.get(0.75)}")
+
     # Visualisierungen
     plot_sign_system_2d(df)
     plot_sign_system_3d(df)