From 97c15bf209a83edac5282801e4d6c5a59e655c50 Mon Sep 17 00:00:00 2001 From: Jochen Hanisch-Johannsen Date: Wed, 3 Sep 2025 21:43:54 +0200 Subject: [PATCH] =?UTF-8?q?1.=09Soft-Scoring=20=C3=BCber=20Text=C3=A4hnlic?= =?UTF-8?q?hkeit=20=09=E2=80=A2=09Neue=20Funktionen=20build=5Flexicons(...?= =?UTF-8?q?)=20und=20compute=5Fsoft=5Fsystem=5Fscores(...)=20(TF-IDF=20+?= =?UTF-8?q?=20Cosine).=20=09=E2=80=A2=09Erzeugt=20Psych=5FScore=20und=20So?= =?UTF-8?q?zial=5FScore=20im=20Bereich=200..1.=20=09=E2=80=A2=09Quelle=20d?= =?UTF-8?q?er=20Lexika:=20zuerst=20system=5Fmapping.csv=20(falls=20vorhand?= =?UTF-8?q?en),=20dann=20bereits=20klassifizierte=20Stichw=C3=B6rter,=20so?= =?UTF-8?q?nst=20konservative=20Heuristik.=20=092.=09Neuer=20Kopplungsinde?= =?UTF-8?q?x=20=09=E2=80=A2=09compute=5Fcoupling=5Findex(...)=20nutzt=20je?= =?UTF-8?q?tzt=20Soft-Scores.=20=09=E2=80=A2=09Kopplungsgewicht=20=3D=20ha?= =?UTF-8?q?rmonisches=20Mittel=20H(p,s)=20=3D=202ps/(p+s).=20=09=E2=80=A2?= =?UTF-8?q?=09Index=20=3D=20sign(d)=20*=20norm(|d|)=20*=20H.=20=09?= =?UTF-8?q?=E2=80=A2=09Adressierungslabel=20basieren=20auf=20den=20Soft-Sc?= =?UTF-8?q?ores=20(Schwelle=200.5).=20=093.=09Plots=20umgestellt=20=09?= =?UTF-8?q?=E2=80=A2=092D-Plot=20nutzt=20Psych=5FScore=20(x)=20und=20Sozia?= =?UTF-8?q?l=5FScore=20(y),=20Achsen=200..1=20mit=20Ticks=200,=20.25,=20.5?= =?UTF-8?q?,=20.75,=201.=20=09=E2=80=A2=093D-Plot=20nutzt=20Soft-Scores=20?= =?UTF-8?q?auf=20x/y,=20d=20auf=20z.=20=09=E2=80=A2=09Farben/Marker=20weit?= =?UTF-8?q?er=20strikt=20aus=20dem=20CI-Template.=20=094.=09Pipeline=20=09?= =?UTF-8?q?=E2=80=A2=09Nach=20classify=5Fsystems(...)=20wird=20compute=5Fs?= =?UTF-8?q?oft=5Fsystem=5Fscores(...)=20aufgerufen.=20=09=E2=80=A2=09Expor?= =?UTF-8?q?t=20erweitert=20um=20Psych=5FScore=20und=20Sozial=5FScore.=20?= =?UTF-8?q?=09=E2=80=A2=09Kleine=20Kurzdiagnostik:=20Quartile=20der=20Soft?= =?UTF-8?q?-Scores=20im=20Terminal.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- visible-learning systemtheoretisch.py | 174 +++++++++++++++++++++----- 1 file changed, 141 insertions(+), 33 deletions(-) diff --git a/visible-learning systemtheoretisch.py b/visible-learning systemtheoretisch.py index 231ff22..232e7f1 100644 --- a/visible-learning systemtheoretisch.py +++ b/visible-learning systemtheoretisch.py @@ -192,6 +192,85 @@ def classify_systems(df: pd.DataFrame, mapping: pd.DataFrame | None = None) -> p return out +# ----------------------------------------- +# Soft Scores via Textähnlichkeit (TF-IDF + Cosine) +# ----------------------------------------- +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity + +def _normalize_01(a: np.ndarray) -> np.ndarray: + a = np.asarray(a, dtype=float) + if a.size == 0: + return a + lo, hi = np.nanmin(a), np.nanmax(a) + if not np.isfinite(lo) or not np.isfinite(hi) or hi - lo <= 1e-12: + return np.zeros_like(a, dtype=float) + return (a - lo) / (hi - lo) + +def build_lexicons(df: pd.DataFrame, mapping: pd.DataFrame | None) -> tuple[str, str]: + """ + Erzeugt zwei 'Pseudodokumente' (Lexika) für psychische vs. soziale Marker. + Präferenz: Mapping → bereits klassifizierte Stichwörter → konservative Heuristik. + """ + # 1) Aus Mapping (explizite Terme) + psych_terms, sozial_terms = [], [] + if mapping is not None and not mapping.empty: + psych_terms = mapping.loc[mapping["Psych"] == 1, "Term"].astype(str).tolist() + sozial_terms = mapping.loc[mapping["Sozial"] == 1, "Term"].astype(str).tolist() + + # 2) Ergänzen durch bereits klassifizierte Stichwörter + if "Psych" in df.columns and "Sozial" in df.columns: + psych_terms += df.loc[df["Psych"] == 1, "Stichwort"].astype(str).tolist() + sozial_terms += df.loc[df["Sozial"] == 1, "Stichwort"].astype(str).tolist() + + # 3) Fallback-Heuristik + if not psych_terms: + psych_terms = [ + "Intelligenz","Kognition","Exekutive Funktionen","Gedächtnis","Selbstwirksamkeit", + "Selbstbild","Emotion","Motivation","Ausdauer","Beharrlichkeit","Zuversicht", + "Mindset","Kreativität","Neugier","Arbeitsgedächtnis","Einstellung","Wille" + ] + if not sozial_terms: + sozial_terms = [ + "Klasse","Beziehung","Lehrer","Schüler","Unterricht","Klima","Team","Gruppe", + "Beratung","Schulleitung","Schule","Familie","Eltern","Zusammenhalt", + "Zugehörigkeit","Curriculum","Stundenplan","Pause","Konflikt","Sommerschule" + ] + + # Als Pseudodokumente zusammenfassen + doc_psych = " ".join(map(str, psych_terms)) + doc_sozial = " ".join(map(str, sozial_terms)) + return doc_psych, doc_sozial + +def compute_soft_system_scores(df: pd.DataFrame, mapping: pd.DataFrame | None) -> pd.DataFrame: + """ + Berechnet kontinuierliche Scores (0..1) für Psychisch/Sozial via TF-IDF + Cosine-Similarity + zu zwei Pseudodokumenten (Lexika). + """ + out = df.copy() + # Pseudodokumente bauen + doc_psych, doc_sozial = build_lexicons(out, mapping) + + # Korpus = alle Stichwörter + 2 Pseudodokumente + corpus = out["Stichwort"].astype(str).tolist() + [doc_psych, doc_sozial] + vect = TfidfVectorizer(max_features=1000, ngram_range=(1,2)) + X = vect.fit_transform(corpus) + + # Indizes der Pseudodocs + idx_psych = X.shape[0] - 2 + idx_sozial = X.shape[0] - 1 + + # Cosine-Similarity jedes Stichworts zu den Pseudodocs + S_psych = cosine_similarity(X[:-2], X[idx_psych]) + S_sozial = cosine_similarity(X[:-2], X[idx_sozial]) + + # Auf [0,1] bringen (zeilenweise Vektoren → 1D) + p_raw = S_psych.ravel() + s_raw = S_sozial.ravel() + out["Psych_Score"] = _normalize_01(p_raw) + out["Sozial_Score"] = _normalize_01(s_raw) + return out + # ----------------------------------------- # Kopplungsindex (Erziehungssystem: Lernfähig/nicht lernfähig) # ----------------------------------------- @@ -206,38 +285,53 @@ def minmax_norm(a: np.ndarray) -> np.ndarray: def compute_coupling_index(df: pd.DataFrame) -> pd.DataFrame: """ - Kopplungsindex = norm(|d|) * w - w = 1.0 bei Kopplung (Psych=1 & Sozial=1) - 0.6 bei nur Psych=1 oder nur Sozial=1 - 0.2 sonst (unspezifisch) - Vorzeichen des Index = Vorzeichen(d) + Kontinuierlicher Kopplungsindex: + - p = Psych_Score (0..1), s = Sozial_Score (0..1); Fallback auf binäre 'Psych'/'Sozial' + - H = harmonisches Mittel = 2ps/(p+s) (0, wenn p+s=0) + - |d| min-max-normalisiert + - Index = sign(d) * norm(|d|) * H """ out = df.copy() + + # Soft Scores oder Fallback + if "Psych_Score" in out.columns and "Sozial_Score" in out.columns: + p = out["Psych_Score"].astype(float).values + s = out["Sozial_Score"].astype(float).values + else: + p = out.get("Psych", pd.Series(0, index=out.index)).astype(float).clip(0,1).values + s = out.get("Sozial", pd.Series(0, index=out.index)).astype(float).clip(0,1).values + + # Harmonisches Mittel (numerisch stabil) + denom = p + s + H = np.zeros_like(denom, dtype=float) + mask = denom > 0 + H[mask] = 2 * p[mask] * s[mask] / denom[mask] + + # |d| normalisieren abs_d = out["Effektstärke"].abs().values - abs_d_norm = minmax_norm(abs_d) + abs_d_norm = _normalize_01(abs_d) - both = (out["Psych"] == 1) & (out["Sozial"] == 1) - single = ((out["Psych"] == 1) ^ (out["Sozial"] == 1)) - none = (out["Psych"] == 0) & (out["Sozial"] == 0) + signed_index = np.sign(out["Effektstärke"].values) * abs_d_norm * H + out["Kopplungsindex"] = signed_index - w = np.where(both, 1.0, np.where(single, 0.6, 0.2)) - signed = np.sign(out["Effektstärke"].values) * abs_d_norm * w - out["Kopplungsindex"] = signed - - # Label für schnelle Lesbarkeit - def addr_label(p, s): - if p == 1 and s == 1: + # Adressierungslabel anhand Soft Scores + def addr_lab(pp, ss): + if pp >= 0.5 and ss >= 0.5: return "Kopplung (Psych+Sozial)" - if p == 1 and s == 0: + if pp >= 0.5 and ss < 0.5: return "Psychisch adressiert" - if p == 0 and s == 1: + if pp < 0.5 and ss >= 0.5: return "Sozial adressiert" return "Unspezifisch" - out["Adressierung"] = [addr_label(p, s) for p, s in zip(out["Psych"], out["Sozial"])] + + # Für Labels Soft-Scores nutzen, falls vorhanden + p_for_label = p + s_for_label = s + out["Adressierung"] = [addr_lab(pp, ss) for pp, ss in zip(p_for_label, s_for_label)] # Ränge out["Rank_abs_d"] = (-out["Effektstärke"].abs()).rank(method="min").astype(int) - out["Rank_kopplung"] = (-out["Kopplungsindex"].abs()).rank(method="min").astype(int) + out["Rank_kopplung"] = (-np.abs(out["Kopplungsindex"])).rank(method="min").astype(int) return out # ----------------------------------------- @@ -247,8 +341,8 @@ def plot_sign_system_2d(df: pd.DataFrame): """ 2D-Sicht: X=Psych (0/1), Y=Sozial (0/1), Markergröße|Farbe ~ Kopplungsindex """ - x = df["Psych"].astype(int) - y = df["Sozial"].astype(int) + x = (df["Psych_Score"] if "Psych_Score" in df.columns else df["Psych"].astype(float).clip(0,1)) + y = (df["Sozial_Score"] if "Sozial_Score" in df.columns else df["Sozial"].astype(float).clip(0,1)) size = (df["Kopplungsindex"].abs() * 22.0 + 6.0).astype(float) color_pos = _colors.get("positiveHighlight", "#2ca02c") @@ -281,19 +375,19 @@ def plot_sign_system_2d(df: pd.DataFrame): name="Thermometer" )) - # Diskrete Achsen (0/1) mit CI-Layout + # Kontinuierliche Achsen (0..1) mit CI-Layout fig.update_layout(_layout( "Erziehungssystem – Adressierung & Kopplung (2D)", - "Psychisch (0/1)", "Sozial (0/1)" + "Psychisch (0..1)", "Sozial (0..1)" )) - fig.update_xaxes(tickmode="array", tickvals=[0, 1], ticktext=["0", "1"]) - fig.update_yaxes(tickmode="array", tickvals=[0, 1], ticktext=["0", "1"]) + fig.update_xaxes(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0]) + fig.update_yaxes(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0]) fig.show() export_figure(fig, "sys_erziehung_2d", export_fig_visual, export_fig_png) def plot_sign_system_3d(df: pd.DataFrame): """ - 3D-Sicht: X=Psych (0/1), Y=Sozial (0/1), Z=Effektstärke; Farbe/Größe ~ Kopplungsindex + 3D-Sicht: X=Psych_Score (0..1), Y=Sozial_Score (0..1), Z=Effektstärke; Farbe/Größe ~ Kopplungsindex """ size = (df["Kopplungsindex"].abs() * 8.0 + 4.0).astype(float) color_pos = _colors.get("positiveHighlight", "#2ca02c") @@ -310,8 +404,8 @@ def plot_sign_system_3d(df: pd.DataFrame): fig = go.Figure() fig.add_trace(go.Scatter3d( - x=df["Psych"].astype(int), - y=df["Sozial"].astype(int), + x=(df["Psych_Score"] if "Psych_Score" in df.columns else df["Psych"].astype(float).clip(0,1)), + y=(df["Sozial_Score"] if "Sozial_Score" in df.columns else df["Sozial"].astype(float).clip(0,1)), z=df["Effektstärke"], mode="markers", marker={**_styles.get("marker_accent", {}), "size": size, "color": point_colors}, @@ -323,11 +417,11 @@ def plot_sign_system_3d(df: pd.DataFrame): fig.update_layout(_layout( "Erziehungssystem – 3D-Sicht (Psych × Sozial × d)", - "Psychisch (0/1)", "Sozial (0/1)", "Cohen d" + "Psychisch (0..1)", "Sozial (0..1)", "Cohen d" )) fig.update_scenes( - xaxis=dict(tickmode="array", tickvals=[0,1], ticktext=["0","1"]), - yaxis=dict(tickmode="array", tickvals=[0,1], ticktext=["0","1"]) + xaxis=dict(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0]), + yaxis=dict(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0]) ) fig.show() export_figure(fig, "sys_erziehung_3d", export_fig_visual, export_fig_png) @@ -373,17 +467,31 @@ def analyse_system(path_csv: str, map_csv: str = "system_mapping.csv"): mapping = load_system_mapping(map_csv) df = classify_systems(df, mapping=mapping) + # Soft Scores aus Textähnlichkeit + df = compute_soft_system_scores(df, mapping=mapping) + # Kopplungsindex df = compute_coupling_index(df) # Export Kern-Output try: - out_cols = ["Thermometer_ID","Stichwort","Kapitel","Kapitelname","Effektstärke","Psych","Sozial","Adressierung","Kopplungsindex"] + out_cols = [ + "Thermometer_ID","Stichwort","Kapitel","Kapitelname","Effektstärke", + "Psych","Sozial","Psych_Score","Sozial_Score", + "Adressierung","Kopplungsindex" + ] df[out_cols].to_csv(os.path.join(EXPORT_DIR, "system_view.csv"), index=False) export_json(df[out_cols].to_dict(orient="records"), "system_view.json") except Exception: pass + # Kurzdiagnostik + print("Soft-Score-Quartile (Psych, Sozial):") + for col in ["Psych_Score","Sozial_Score"]: + if col in df.columns: + q = df[col].quantile([0.25,0.5,0.75]).round(3).to_dict() + print(f" {col}: q25={q.get(0.25)}, q50={q.get(0.5)}, q75={q.get(0.75)}") + # Visualisierungen plot_sign_system_2d(df) plot_sign_system_3d(df)