1. Soft-Scoring über Textähnlichkeit
• Neue Funktionen build_lexicons(...) und compute_soft_system_scores(...) (TF-IDF + Cosine).
• Erzeugt Psych_Score und Sozial_Score im Bereich 0..1.
• Quelle der Lexika: zuerst system_mapping.csv (falls vorhanden), dann bereits klassifizierte Stichwörter, sonst konservative Heuristik.

2. Neuer Kopplungsindex
• compute_coupling_index(...) nutzt jetzt Soft-Scores.
• Kopplungsgewicht = harmonisches Mittel H(p,s) = 2ps/(p+s).
• Index = sign(d) * norm(|d|) * H.
• Adressierungslabel basieren auf den Soft-Scores (Schwelle 0.5).

3. Plots umgestellt
• 2D-Plot nutzt Psych_Score (x) und Sozial_Score (y), Achsen 0..1 mit Ticks 0, .25, .5, .75, 1.
• 3D-Plot nutzt Soft-Scores auf x/y, d auf z.
• Farben/Marker weiter strikt aus dem CI-Template.

4. Pipeline
• Nach classify_systems(...) wird compute_soft_system_scores(...) aufgerufen.
• Export erweitert um Psych_Score und Sozial_Score.
• Kleine Kurzdiagnostik: Quartile der Soft-Scores im Terminal.
This commit is contained in:
@ -192,6 +192,85 @@ def classify_systems(df: pd.DataFrame, mapping: pd.DataFrame | None = None) -> p
|
|||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
# -----------------------------------------
|
||||||
|
# Soft Scores via Textähnlichkeit (TF-IDF + Cosine)
|
||||||
|
# -----------------------------------------
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
|
||||||
|
def _normalize_01(a: np.ndarray) -> np.ndarray:
|
||||||
|
a = np.asarray(a, dtype=float)
|
||||||
|
if a.size == 0:
|
||||||
|
return a
|
||||||
|
lo, hi = np.nanmin(a), np.nanmax(a)
|
||||||
|
if not np.isfinite(lo) or not np.isfinite(hi) or hi - lo <= 1e-12:
|
||||||
|
return np.zeros_like(a, dtype=float)
|
||||||
|
return (a - lo) / (hi - lo)
|
||||||
|
|
||||||
|
def build_lexicons(df: pd.DataFrame, mapping: pd.DataFrame | None) -> tuple[str, str]:
|
||||||
|
"""
|
||||||
|
Erzeugt zwei 'Pseudodokumente' (Lexika) für psychische vs. soziale Marker.
|
||||||
|
Präferenz: Mapping → bereits klassifizierte Stichwörter → konservative Heuristik.
|
||||||
|
"""
|
||||||
|
# 1) Aus Mapping (explizite Terme)
|
||||||
|
psych_terms, sozial_terms = [], []
|
||||||
|
if mapping is not None and not mapping.empty:
|
||||||
|
psych_terms = mapping.loc[mapping["Psych"] == 1, "Term"].astype(str).tolist()
|
||||||
|
sozial_terms = mapping.loc[mapping["Sozial"] == 1, "Term"].astype(str).tolist()
|
||||||
|
|
||||||
|
# 2) Ergänzen durch bereits klassifizierte Stichwörter
|
||||||
|
if "Psych" in df.columns and "Sozial" in df.columns:
|
||||||
|
psych_terms += df.loc[df["Psych"] == 1, "Stichwort"].astype(str).tolist()
|
||||||
|
sozial_terms += df.loc[df["Sozial"] == 1, "Stichwort"].astype(str).tolist()
|
||||||
|
|
||||||
|
# 3) Fallback-Heuristik
|
||||||
|
if not psych_terms:
|
||||||
|
psych_terms = [
|
||||||
|
"Intelligenz","Kognition","Exekutive Funktionen","Gedächtnis","Selbstwirksamkeit",
|
||||||
|
"Selbstbild","Emotion","Motivation","Ausdauer","Beharrlichkeit","Zuversicht",
|
||||||
|
"Mindset","Kreativität","Neugier","Arbeitsgedächtnis","Einstellung","Wille"
|
||||||
|
]
|
||||||
|
if not sozial_terms:
|
||||||
|
sozial_terms = [
|
||||||
|
"Klasse","Beziehung","Lehrer","Schüler","Unterricht","Klima","Team","Gruppe",
|
||||||
|
"Beratung","Schulleitung","Schule","Familie","Eltern","Zusammenhalt",
|
||||||
|
"Zugehörigkeit","Curriculum","Stundenplan","Pause","Konflikt","Sommerschule"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Als Pseudodokumente zusammenfassen
|
||||||
|
doc_psych = " ".join(map(str, psych_terms))
|
||||||
|
doc_sozial = " ".join(map(str, sozial_terms))
|
||||||
|
return doc_psych, doc_sozial
|
||||||
|
|
||||||
|
def compute_soft_system_scores(df: pd.DataFrame, mapping: pd.DataFrame | None) -> pd.DataFrame:
|
||||||
|
"""
|
||||||
|
Berechnet kontinuierliche Scores (0..1) für Psychisch/Sozial via TF-IDF + Cosine-Similarity
|
||||||
|
zu zwei Pseudodokumenten (Lexika).
|
||||||
|
"""
|
||||||
|
out = df.copy()
|
||||||
|
# Pseudodokumente bauen
|
||||||
|
doc_psych, doc_sozial = build_lexicons(out, mapping)
|
||||||
|
|
||||||
|
# Korpus = alle Stichwörter + 2 Pseudodokumente
|
||||||
|
corpus = out["Stichwort"].astype(str).tolist() + [doc_psych, doc_sozial]
|
||||||
|
vect = TfidfVectorizer(max_features=1000, ngram_range=(1,2))
|
||||||
|
X = vect.fit_transform(corpus)
|
||||||
|
|
||||||
|
# Indizes der Pseudodocs
|
||||||
|
idx_psych = X.shape[0] - 2
|
||||||
|
idx_sozial = X.shape[0] - 1
|
||||||
|
|
||||||
|
# Cosine-Similarity jedes Stichworts zu den Pseudodocs
|
||||||
|
S_psych = cosine_similarity(X[:-2], X[idx_psych])
|
||||||
|
S_sozial = cosine_similarity(X[:-2], X[idx_sozial])
|
||||||
|
|
||||||
|
# Auf [0,1] bringen (zeilenweise Vektoren → 1D)
|
||||||
|
p_raw = S_psych.ravel()
|
||||||
|
s_raw = S_sozial.ravel()
|
||||||
|
out["Psych_Score"] = _normalize_01(p_raw)
|
||||||
|
out["Sozial_Score"] = _normalize_01(s_raw)
|
||||||
|
return out
|
||||||
|
|
||||||
# -----------------------------------------
|
# -----------------------------------------
|
||||||
# Kopplungsindex (Erziehungssystem: Lernfähig/nicht lernfähig)
|
# Kopplungsindex (Erziehungssystem: Lernfähig/nicht lernfähig)
|
||||||
# -----------------------------------------
|
# -----------------------------------------
|
||||||
@ -206,38 +285,53 @@ def minmax_norm(a: np.ndarray) -> np.ndarray:
|
|||||||
|
|
||||||
def compute_coupling_index(df: pd.DataFrame) -> pd.DataFrame:
|
def compute_coupling_index(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Kopplungsindex = norm(|d|) * w
|
Kontinuierlicher Kopplungsindex:
|
||||||
w = 1.0 bei Kopplung (Psych=1 & Sozial=1)
|
- p = Psych_Score (0..1), s = Sozial_Score (0..1); Fallback auf binäre 'Psych'/'Sozial'
|
||||||
0.6 bei nur Psych=1 oder nur Sozial=1
|
- H = harmonisches Mittel = 2ps/(p+s) (0, wenn p+s=0)
|
||||||
0.2 sonst (unspezifisch)
|
- |d| min-max-normalisiert
|
||||||
Vorzeichen des Index = Vorzeichen(d)
|
- Index = sign(d) * norm(|d|) * H
|
||||||
"""
|
"""
|
||||||
out = df.copy()
|
out = df.copy()
|
||||||
|
|
||||||
|
# Soft Scores oder Fallback
|
||||||
|
if "Psych_Score" in out.columns and "Sozial_Score" in out.columns:
|
||||||
|
p = out["Psych_Score"].astype(float).values
|
||||||
|
s = out["Sozial_Score"].astype(float).values
|
||||||
|
else:
|
||||||
|
p = out.get("Psych", pd.Series(0, index=out.index)).astype(float).clip(0,1).values
|
||||||
|
s = out.get("Sozial", pd.Series(0, index=out.index)).astype(float).clip(0,1).values
|
||||||
|
|
||||||
|
# Harmonisches Mittel (numerisch stabil)
|
||||||
|
denom = p + s
|
||||||
|
H = np.zeros_like(denom, dtype=float)
|
||||||
|
mask = denom > 0
|
||||||
|
H[mask] = 2 * p[mask] * s[mask] / denom[mask]
|
||||||
|
|
||||||
|
# |d| normalisieren
|
||||||
abs_d = out["Effektstärke"].abs().values
|
abs_d = out["Effektstärke"].abs().values
|
||||||
abs_d_norm = minmax_norm(abs_d)
|
abs_d_norm = _normalize_01(abs_d)
|
||||||
|
|
||||||
both = (out["Psych"] == 1) & (out["Sozial"] == 1)
|
signed_index = np.sign(out["Effektstärke"].values) * abs_d_norm * H
|
||||||
single = ((out["Psych"] == 1) ^ (out["Sozial"] == 1))
|
out["Kopplungsindex"] = signed_index
|
||||||
none = (out["Psych"] == 0) & (out["Sozial"] == 0)
|
|
||||||
|
|
||||||
w = np.where(both, 1.0, np.where(single, 0.6, 0.2))
|
# Adressierungslabel anhand Soft Scores
|
||||||
signed = np.sign(out["Effektstärke"].values) * abs_d_norm * w
|
def addr_lab(pp, ss):
|
||||||
out["Kopplungsindex"] = signed
|
if pp >= 0.5 and ss >= 0.5:
|
||||||
|
|
||||||
# Label für schnelle Lesbarkeit
|
|
||||||
def addr_label(p, s):
|
|
||||||
if p == 1 and s == 1:
|
|
||||||
return "Kopplung (Psych+Sozial)"
|
return "Kopplung (Psych+Sozial)"
|
||||||
if p == 1 and s == 0:
|
if pp >= 0.5 and ss < 0.5:
|
||||||
return "Psychisch adressiert"
|
return "Psychisch adressiert"
|
||||||
if p == 0 and s == 1:
|
if pp < 0.5 and ss >= 0.5:
|
||||||
return "Sozial adressiert"
|
return "Sozial adressiert"
|
||||||
return "Unspezifisch"
|
return "Unspezifisch"
|
||||||
out["Adressierung"] = [addr_label(p, s) for p, s in zip(out["Psych"], out["Sozial"])]
|
|
||||||
|
# Für Labels Soft-Scores nutzen, falls vorhanden
|
||||||
|
p_for_label = p
|
||||||
|
s_for_label = s
|
||||||
|
out["Adressierung"] = [addr_lab(pp, ss) for pp, ss in zip(p_for_label, s_for_label)]
|
||||||
|
|
||||||
# Ränge
|
# Ränge
|
||||||
out["Rank_abs_d"] = (-out["Effektstärke"].abs()).rank(method="min").astype(int)
|
out["Rank_abs_d"] = (-out["Effektstärke"].abs()).rank(method="min").astype(int)
|
||||||
out["Rank_kopplung"] = (-out["Kopplungsindex"].abs()).rank(method="min").astype(int)
|
out["Rank_kopplung"] = (-np.abs(out["Kopplungsindex"])).rank(method="min").astype(int)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
# -----------------------------------------
|
# -----------------------------------------
|
||||||
@ -247,8 +341,8 @@ def plot_sign_system_2d(df: pd.DataFrame):
|
|||||||
"""
|
"""
|
||||||
2D-Sicht: X=Psych (0/1), Y=Sozial (0/1), Markergröße|Farbe ~ Kopplungsindex
|
2D-Sicht: X=Psych (0/1), Y=Sozial (0/1), Markergröße|Farbe ~ Kopplungsindex
|
||||||
"""
|
"""
|
||||||
x = df["Psych"].astype(int)
|
x = (df["Psych_Score"] if "Psych_Score" in df.columns else df["Psych"].astype(float).clip(0,1))
|
||||||
y = df["Sozial"].astype(int)
|
y = (df["Sozial_Score"] if "Sozial_Score" in df.columns else df["Sozial"].astype(float).clip(0,1))
|
||||||
size = (df["Kopplungsindex"].abs() * 22.0 + 6.0).astype(float)
|
size = (df["Kopplungsindex"].abs() * 22.0 + 6.0).astype(float)
|
||||||
|
|
||||||
color_pos = _colors.get("positiveHighlight", "#2ca02c")
|
color_pos = _colors.get("positiveHighlight", "#2ca02c")
|
||||||
@ -281,19 +375,19 @@ def plot_sign_system_2d(df: pd.DataFrame):
|
|||||||
name="Thermometer"
|
name="Thermometer"
|
||||||
))
|
))
|
||||||
|
|
||||||
# Diskrete Achsen (0/1) mit CI-Layout
|
# Kontinuierliche Achsen (0..1) mit CI-Layout
|
||||||
fig.update_layout(_layout(
|
fig.update_layout(_layout(
|
||||||
"Erziehungssystem – Adressierung & Kopplung (2D)",
|
"Erziehungssystem – Adressierung & Kopplung (2D)",
|
||||||
"Psychisch (0/1)", "Sozial (0/1)"
|
"Psychisch (0..1)", "Sozial (0..1)"
|
||||||
))
|
))
|
||||||
fig.update_xaxes(tickmode="array", tickvals=[0, 1], ticktext=["0", "1"])
|
fig.update_xaxes(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0])
|
||||||
fig.update_yaxes(tickmode="array", tickvals=[0, 1], ticktext=["0", "1"])
|
fig.update_yaxes(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0])
|
||||||
fig.show()
|
fig.show()
|
||||||
export_figure(fig, "sys_erziehung_2d", export_fig_visual, export_fig_png)
|
export_figure(fig, "sys_erziehung_2d", export_fig_visual, export_fig_png)
|
||||||
|
|
||||||
def plot_sign_system_3d(df: pd.DataFrame):
|
def plot_sign_system_3d(df: pd.DataFrame):
|
||||||
"""
|
"""
|
||||||
3D-Sicht: X=Psych (0/1), Y=Sozial (0/1), Z=Effektstärke; Farbe/Größe ~ Kopplungsindex
|
3D-Sicht: X=Psych_Score (0..1), Y=Sozial_Score (0..1), Z=Effektstärke; Farbe/Größe ~ Kopplungsindex
|
||||||
"""
|
"""
|
||||||
size = (df["Kopplungsindex"].abs() * 8.0 + 4.0).astype(float)
|
size = (df["Kopplungsindex"].abs() * 8.0 + 4.0).astype(float)
|
||||||
color_pos = _colors.get("positiveHighlight", "#2ca02c")
|
color_pos = _colors.get("positiveHighlight", "#2ca02c")
|
||||||
@ -310,8 +404,8 @@ def plot_sign_system_3d(df: pd.DataFrame):
|
|||||||
|
|
||||||
fig = go.Figure()
|
fig = go.Figure()
|
||||||
fig.add_trace(go.Scatter3d(
|
fig.add_trace(go.Scatter3d(
|
||||||
x=df["Psych"].astype(int),
|
x=(df["Psych_Score"] if "Psych_Score" in df.columns else df["Psych"].astype(float).clip(0,1)),
|
||||||
y=df["Sozial"].astype(int),
|
y=(df["Sozial_Score"] if "Sozial_Score" in df.columns else df["Sozial"].astype(float).clip(0,1)),
|
||||||
z=df["Effektstärke"],
|
z=df["Effektstärke"],
|
||||||
mode="markers",
|
mode="markers",
|
||||||
marker={**_styles.get("marker_accent", {}), "size": size, "color": point_colors},
|
marker={**_styles.get("marker_accent", {}), "size": size, "color": point_colors},
|
||||||
@ -323,11 +417,11 @@ def plot_sign_system_3d(df: pd.DataFrame):
|
|||||||
|
|
||||||
fig.update_layout(_layout(
|
fig.update_layout(_layout(
|
||||||
"Erziehungssystem – 3D-Sicht (Psych × Sozial × d)",
|
"Erziehungssystem – 3D-Sicht (Psych × Sozial × d)",
|
||||||
"Psychisch (0/1)", "Sozial (0/1)", "Cohen d"
|
"Psychisch (0..1)", "Sozial (0..1)", "Cohen d"
|
||||||
))
|
))
|
||||||
fig.update_scenes(
|
fig.update_scenes(
|
||||||
xaxis=dict(tickmode="array", tickvals=[0,1], ticktext=["0","1"]),
|
xaxis=dict(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0]),
|
||||||
yaxis=dict(tickmode="array", tickvals=[0,1], ticktext=["0","1"])
|
yaxis=dict(range=[0,1], tickmode="array", tickvals=[0,0.25,0.5,0.75,1.0])
|
||||||
)
|
)
|
||||||
fig.show()
|
fig.show()
|
||||||
export_figure(fig, "sys_erziehung_3d", export_fig_visual, export_fig_png)
|
export_figure(fig, "sys_erziehung_3d", export_fig_visual, export_fig_png)
|
||||||
@ -373,17 +467,31 @@ def analyse_system(path_csv: str, map_csv: str = "system_mapping.csv"):
|
|||||||
mapping = load_system_mapping(map_csv)
|
mapping = load_system_mapping(map_csv)
|
||||||
df = classify_systems(df, mapping=mapping)
|
df = classify_systems(df, mapping=mapping)
|
||||||
|
|
||||||
|
# Soft Scores aus Textähnlichkeit
|
||||||
|
df = compute_soft_system_scores(df, mapping=mapping)
|
||||||
|
|
||||||
# Kopplungsindex
|
# Kopplungsindex
|
||||||
df = compute_coupling_index(df)
|
df = compute_coupling_index(df)
|
||||||
|
|
||||||
# Export Kern-Output
|
# Export Kern-Output
|
||||||
try:
|
try:
|
||||||
out_cols = ["Thermometer_ID","Stichwort","Kapitel","Kapitelname","Effektstärke","Psych","Sozial","Adressierung","Kopplungsindex"]
|
out_cols = [
|
||||||
|
"Thermometer_ID","Stichwort","Kapitel","Kapitelname","Effektstärke",
|
||||||
|
"Psych","Sozial","Psych_Score","Sozial_Score",
|
||||||
|
"Adressierung","Kopplungsindex"
|
||||||
|
]
|
||||||
df[out_cols].to_csv(os.path.join(EXPORT_DIR, "system_view.csv"), index=False)
|
df[out_cols].to_csv(os.path.join(EXPORT_DIR, "system_view.csv"), index=False)
|
||||||
export_json(df[out_cols].to_dict(orient="records"), "system_view.json")
|
export_json(df[out_cols].to_dict(orient="records"), "system_view.json")
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Kurzdiagnostik
|
||||||
|
print("Soft-Score-Quartile (Psych, Sozial):")
|
||||||
|
for col in ["Psych_Score","Sozial_Score"]:
|
||||||
|
if col in df.columns:
|
||||||
|
q = df[col].quantile([0.25,0.5,0.75]).round(3).to_dict()
|
||||||
|
print(f" {col}: q25={q.get(0.25)}, q50={q.get(0.5)}, q75={q.get(0.75)}")
|
||||||
|
|
||||||
# Visualisierungen
|
# Visualisierungen
|
||||||
plot_sign_system_2d(df)
|
plot_sign_system_2d(df)
|
||||||
plot_sign_system_3d(df)
|
plot_sign_system_3d(df)
|
||||||
|
|||||||
Reference in New Issue
Block a user