"""
|
|
CAVE!!!!!
|
|
|
|
Datei muss aus Zotero mit BibTeX exportiert werden!
|
|
"""
|
|
|
|
import os

# Clear the terminal
os.system('cls' if os.name == 'nt' else 'clear')

import sqlite3
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from datetime import datetime
from collections import defaultdict, Counter
from itertools import product
from types import SimpleNamespace
from wordcloud import WordCloud
from tabulate import tabulate
import plotly.express as px
import plotly.graph_objects as go
import random
import math
import re
import subprocess

# Export flags for the visualizations
export_fig_visualize_network = False
export_fig_visualize_tags = False
export_fig_visualize_index = False
export_fig_visualize_research_questions = False
export_fig_visualize_categories = False
export_fig_visualize_time_series = False
export_fig_visualize_top_authors = False
export_fig_visualize_top_publications = False
export_fig_create_path_diagram = False
export_fig_create_sankey_diagram = False
export_fig_visualize_sources_status = False
export_fig_create_wordcloud_from_titles = False
export_fig_visualize_search_term_distribution = False

# Optional slugify helper
def slugify(value):
    return re.sub(r'[^a-zA-Z0-9_-]', '', value.replace(' ', '_').lower())

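# Example (illustrative):
#   >>> slugify("Visualize Network!")
#   'visualize_network'
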
# Export functions for the visualizations.
# All exporters share the same logic (write HTML, copy via scp), so a single
# generic helper is used; the thin wrappers keep the original function names
# for the call sites below.
def _export_figure(fig, name, enabled):
    if not enabled:
        return
    safe_filename = slugify(name)
    export_path = f"{safe_filename}.html"
    fig.write_html(export_path, full_html=True, include_plotlyjs="cdn")
    remote_path = "jochen-hanisch@sternenflottenakademie.local:/mnt/deep-space-nine/public/plot/promotion/"
    try:
        subprocess.run(["scp", export_path, remote_path], check=True, capture_output=True, text=True)
        print(f"✅ Datei '{export_path}' erfolgreich übertragen.")
    except subprocess.CalledProcessError as e:
        print("❌ Fehler beim Übertragen:")
        print(e.stderr)


def export_visualize_network(fig):
    _export_figure(fig, "visualize_network", export_fig_visualize_network)


def export_visualize_tags(fig):
    _export_figure(fig, "visualize_tags", export_fig_visualize_tags)


def export_visualize_index(fig):
    _export_figure(fig, "visualize_index", export_fig_visualize_index)


def export_visualize_research_questions(fig):
    _export_figure(fig, "visualize_research_questions", export_fig_visualize_research_questions)


def export_visualize_categories(fig):
    _export_figure(fig, "visualize_categories", export_fig_visualize_categories)


def export_visualize_time_series(fig):
    _export_figure(fig, "visualize_time_series", export_fig_visualize_time_series)


def export_visualize_top_authors(fig):
    _export_figure(fig, "visualize_top_authors", export_fig_visualize_top_authors)


def export_visualize_top_publications(fig):
    _export_figure(fig, "visualize_top_publications", export_fig_visualize_top_publications)


def export_create_path_diagram(fig):
    _export_figure(fig, "create_path_diagram", export_fig_create_path_diagram)


def export_create_sankey_diagram(fig):
    _export_figure(fig, "create_sankey_diagram", export_fig_create_sankey_diagram)


def export_visualize_sources_status(fig):
    _export_figure(fig, "visualize_sources_status", export_fig_visualize_sources_status)

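# Usage sketch (illustrative): set a flag to True before calling the matching
# visualization, e.g.
#   export_fig_visualize_tags = True
#   visualize_tags(bib_database)  # shows the figure and uploads visualize_tags.html
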
# Define colors
colors = {
    "background": "#003366",         # background color
    "text": "#333333",               # text color
    "accent": "#663300",             # accent color
    "primaryLine": "#660066",        # educational impact factor
    "secondaryLine": "#cc6600",      # educational impact indicator
    "depthArea": "#006666",          # competence measurement uncertainty
    "brightArea": "#66CCCC",         # competence development uncertainty
    "positiveHighlight": "#336600",  # positive highlight
    "negativeHighlight": "#990000",  # negative highlight
    "white": "#ffffff"               # white
}

# Colors used for the words in the word cloud
word_colors = [
    colors["white"],
    colors["brightArea"],
    colors["positiveHighlight"],
    colors["negativeHighlight"]
]

# Current date
current_date = datetime.now().strftime("%Y-%m-%d")

# Load the Zotero SQLite database and build a structure compatible with
# bibtexparser's bib_database.entries
ZOTERO_DB_PATH = '/Users/jochen_hanisch-johannsen/Zotero/zotero.sqlite'


def load_zotero_entries(sqlite_path):
    """Read the relevant Zotero collections and return bibtexparser-style entry dicts."""
    conn = sqlite3.connect(sqlite_path)
    cursor = conn.cursor()

    # Each itemData join is restricted to a single fieldID via a scalar
    # subquery; otherwise every field of an item would match the join and
    # GROUP BY would pick arbitrary title/year values.
    query = """
    SELECT
        items.itemID,
        COALESCE(value_title.value, '') AS title,
        COALESCE(value_year.value, '') AS year,
        COALESCE(creators.lastName || ', ' || creators.firstName, '') AS author,
        GROUP_CONCAT(DISTINCT tags.name) AS keywords,
        itemTypes.typeName AS type
    FROM items

    -- Title
    LEFT JOIN itemData AS title_data
        ON items.itemID = title_data.itemID
        AND title_data.fieldID = (SELECT fieldID FROM fields WHERE fieldName = 'title')
    LEFT JOIN itemDataValues AS value_title ON title_data.valueID = value_title.valueID

    -- Year (Zotero stores it in the 'date' field)
    LEFT JOIN itemData AS year_data
        ON items.itemID = year_data.itemID
        AND year_data.fieldID = (SELECT fieldID FROM fields WHERE fieldName = 'date')
    LEFT JOIN itemDataValues AS value_year ON year_data.valueID = value_year.valueID

    -- Authors (note: GROUP BY items.itemID keeps only one creator per item)
    LEFT JOIN itemCreators ON items.itemID = itemCreators.itemID
    LEFT JOIN creators ON itemCreators.creatorID = creators.creatorID

    -- Tags
    LEFT JOIN itemTags ON items.itemID = itemTags.itemID
    LEFT JOIN tags ON itemTags.tagID = tags.tagID

    -- Item type
    LEFT JOIN itemTypes ON items.itemTypeID = itemTypes.itemTypeID

    -- Collections
    LEFT JOIN collectionItems ON items.itemID = collectionItems.itemID
    LEFT JOIN collections ON collectionItems.collectionID = collections.collectionID

    WHERE collections.collectionName IN (
        'S:01 Learning Management System',
        'S:02 Online-Lernplattform',
        'S:03 Online-Lernumgebung',
        'S:05 eLearning',
        'S:04 MOOC',
        'S:06 Bildungstechnologie',
        'S:07 Digitale Medien',
        'S:08 Blended Learning',
        'S:09 Digitales Lernen',
        'S:12 Digital Learning',
        'S:10 Online Lernen',
        'S:11 Online Learning',
        'S:13 Berichte',
        'S:14 Agiles Lernen',
        'S:15 Learning Analytics',
        'S:16 Dissertationen',
        'S:17 ePortfolio'
    )
    GROUP BY items.itemID
    """

    cursor.execute(query)
    rows = cursor.fetchall()

    # Convert to a bib_database.entries-compatible format
    entries = []
    for row in rows:
        entries.append({
            'ID': str(row[0]),
            'title': row[1],
            'year': row[2],
            'author': row[3],
            'keywords': row[4] if row[4] else '',
            'ENTRYTYPE': row[5]
        })

    conn.close()
    return entries


bib_database = SimpleNamespace(entries=load_zotero_entries(ZOTERO_DB_PATH))

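# Each entry mimics bibtexparser's dict layout, e.g. (illustrative values):
# {'ID': '4711', 'title': 'Learning Analytics in MOOCs', 'year': '2021',
#  'author': 'Doe, Jane', 'keywords': '#4:zeitschriftenartikel:mooc,promotion:fu3',
#  'ENTRYTYPE': 'journalArticle'}
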
# Load the stop word lists
with open('de_complete.txt', 'r', encoding='utf-8') as file:
    stop_words_de = set(file.read().split())

with open('en_complete.txt', 'r', encoding='utf-8') as file:
    stop_words_en = set(file.read().split())

# Combined stop word list
stop_words = stop_words_de.union(stop_words_en)

# Sample size calculation (Cochran's formula with finite population correction)
def calculate_sample_size(N, Z=1.96, p=0.5, e=0.05):
    """
    Calculate the sample size from the total number of entries (N).
    """
    if N <= 0:
        return 0
    n_0 = (Z**2 * p * (1 - p)) / (e**2)
    n = n_0 / (1 + ((n_0 - 1) / N))
    return math.ceil(n)

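# Worked example (illustrative): for N = 500 identified sources the defaults
# (Z = 1.96, p = 0.5, e = 0.05) give n_0 = 384.16, hence
#   >>> calculate_sample_size(500)
#   218
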
# Visualization 1: network analysis
def visualize_network(bib_database):
    search_terms = {
        '0': 'digital:learning',
        '1': 'learning:management:system',
        '2': 'online:Lernplattform',
        '3': 'online:Lernumgebung',
        '4': 'MOOC',
        '5': 'e-learning',
        '6': 'Bildung:Technologie',
        '7': 'digital:Medien',
        '8': 'blended:learning',
        '9': 'digital:lernen',
        'a': 'online:lernen',
        'b': 'online:learning'
    }

    numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b']
    types = [
        'Zeitschriftenartikel',
        'Buch',
        'Buchteil',
        'Bericht',
        'Konferenz-Paper'
    ]

    tags_to_search = set()
    for number, type_ in product(numbers, types):
        search_term = search_terms[number]
        tag = f'#{number}:{type_}:{search_term}'
        tags_to_search.add(tag.lower())

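    # Example of a generated tag (illustrative):
    #   '#5:zeitschriftenartikel:e-learning'
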
    tag_counts = defaultdict(int)
    for entry in bib_database.entries:
        if 'keywords' in entry:
            entry_keywords = list(map(str.lower, map(str.strip, entry['keywords'].replace('\\#', '#').split(','))))
            for keyword in entry_keywords:
                for tag in tags_to_search:
                    if tag in keyword:
                        tag_counts[tag] += 1

    # Aggregate the hit counts per search term. The search term is everything
    # after '#<number>:<type>:' and may itself contain colons, so the tag must
    # not be split on the last colon only.
    fundzahlen = defaultdict(int)
    for tag, count in tag_counts.items():
        search_term = ':'.join(tag.split(':')[2:])
        for value in search_terms.values():
            if search_term == value.lower():
                fundzahlen[value.lower()] += count

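    # Example (illustrative): '#1:zeitschriftenartikel:learning:management:system'
    # yields the search term 'learning:management:system'.
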
    search_terms_network = {
        "Primäre Begriffe": {
            "learning:management:system": [
                "e-learning",
                "bildung:technologie",
                "online:lernplattform",
                "online:lernumgebung",
                "digital:learning",
                "digitales:lernen"
            ]
        },
        "Sekundäre Begriffe": {
            "e-learning": [
                "mooc",
                "online:lernplattform"
            ],
            "bildung:technologie": [
                "digital:learning",
                "digitales:lernen",
                "blended:learning"
            ],
            "digital:learning": [
                "digitale:medien",
                "online:learning"
            ],
            "digitales:lernen": [
                "digitale:medien",
                "online:lernen"
            ],
            "blended:learning": ["mooc"]
        },
        "Tertiäre Begriffe": {
            "online:learning": [],
            "online:lernen": []
        }
    }

    G = nx.Graph()

    hierarchy_colors = {
        "Primäre Begriffe": colors['primaryLine'],
        "Sekundäre Begriffe": colors['secondaryLine'],
        "Tertiäre Begriffe": colors['brightArea']
    }

    def add_terms_to_graph(level, terms):
        for primary_term, related_terms in terms.items():
            if primary_term not in G:
                G.add_node(primary_term, color=hierarchy_colors[level], size=fundzahlen.get(primary_term, 10))
            elif level == "Tertiäre Begriffe":
                G.nodes[primary_term]['color'] = hierarchy_colors[level]
            for related_term in related_terms:
                if related_term not in G:
                    G.add_node(related_term, color=hierarchy_colors[level], size=fundzahlen.get(related_term, 10))
                elif level == "Tertiäre Begriffe":
                    G.nodes[related_term]['color'] = hierarchy_colors[level]
                G.add_edge(primary_term, related_term)

    for level, terms in search_terms_network.items():
        add_terms_to_graph(level, terms)

    np.random.seed(42)
    # Pass the seed explicitly as well: newer networkx versions do not use
    # numpy's global random state for the layout.
    pos = nx.spring_layout(G, seed=42)

    # Rescale the spring layout onto a fixed 0-10 grid, then clamp the values
    x_scale_min, x_scale_max = 0, 10
    y_scale_min, y_scale_max = 0, 10

    min_x = min(pos[node][0] for node in pos)
    max_x = max(pos[node][0] for node in pos)
    min_y = min(pos[node][1] for node in pos)
    max_y = max(pos[node][1] for node in pos)

    scale_x_range = x_scale_max - x_scale_min
    scale_y_range = y_scale_max - y_scale_min

    for node in pos:
        x, y = pos[node]
        norm_x = scale_x_range * (x - min_x) / (max_x - min_x) + x_scale_min
        norm_y = scale_y_range * (y - min_y) / (max_y - min_y) + y_scale_min
        pos[node] = (norm_x, norm_y)

    for node in pos:
        x, y = pos[node]
        x = max(min(x, x_scale_max), x_scale_min)
        y = max(min(y, y_scale_max), y_scale_min)
        pos[node] = (x, y)

    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])  # None breaks the line between edges
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color=colors['white']),
        hoverinfo='none',
        mode='lines')

    # Split the nodes into three traces: primary, secondary, tertiary
    primary_nodes = []
    secondary_nodes = []
    tertiary_nodes = []

    for node in G.nodes():
        color = G.nodes[node]['color']
        size = math.log(G.nodes[node].get('size', 10) + 1) * 10
        x, y = pos[node]
        hovertext = f"{node}<br>Anzahl Funde: {fundzahlen.get(node, 0)}"
        node_data = dict(x=x, y=y, text=node, size=size, hovertext=hovertext)
        if color == colors['primaryLine']:
            primary_nodes.append(node_data)
        elif color == colors['secondaryLine']:
            secondary_nodes.append(node_data)
        elif color == colors['brightArea']:
            tertiary_nodes.append(node_data)

    def create_node_trace(nodes, name, color):
        return go.Scatter(
            x=[n['x'] for n in nodes],
            y=[n['y'] for n in nodes],
            mode='markers+text',
            text=[n['text'] for n in nodes],
            hovertext=[n['hovertext'] for n in nodes],
            hoverinfo='text',
            marker=dict(
                size=[n['size'] for n in nodes],
                color=color,
                line_width=2
            ),
            textposition="top center",
            textfont=dict(size=12),
            name=name
        )

    primary_trace = create_node_trace(primary_nodes, "Primäre Begriffe", colors['primaryLine'])
    secondary_trace = create_node_trace(secondary_nodes, "Sekundäre Begriffe", colors['secondaryLine'])
    tertiary_trace = create_node_trace(tertiary_nodes, "Tertiäre Begriffe", colors['brightArea'])

    fig = go.Figure(data=[edge_trace, primary_trace, secondary_trace, tertiary_trace],
                    layout=go.Layout(
                        title=f'Suchbegriff-Netzwerk nach Relevanz und Semantik (n={sum(fundzahlen.values())}, Stand: {current_date})',
                        title_font_size=16,
                        showlegend=True,
                        legend=dict(
                            bgcolor=colors['background'],
                            bordercolor=colors['white'],
                            borderwidth=1,
                            font=dict(color=colors['white']),
                            itemsizing='constant'
                        ),
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(
                            range=[x_scale_min, x_scale_max + 1],
                            showgrid=True,
                            zeroline=True,
                            tickmode='linear',
                            tick0=x_scale_min,
                            dtick=(x_scale_max - x_scale_min) / 4,
                            title='Technologische Dimension'
                        ),
                        yaxis=dict(
                            range=[y_scale_min, y_scale_max + 1],
                            showgrid=True,
                            zeroline=True,
                            tickmode='linear',
                            tick0=y_scale_min,
                            dtick=(y_scale_max - y_scale_min) / 4,
                            title='Pädagogische Dimension'
                        ),
                        plot_bgcolor=colors['background'],
                        paper_bgcolor=colors['background'],
                        font=dict(color=colors['white'])
                    ))

    fig.show()
    export_visualize_network(fig)

    # Simple path analysis after showing the figure
    if 'e-learning' in G and 'online:lernen' in G:
        try:
            pfad = nx.shortest_path(G, source='e-learning', target='online:lernen')
            print(f"Kürzester Pfad von 'e-learning' zu 'online:lernen': {pfad}")
        except nx.NetworkXNoPath:
            print("Kein Pfad von 'e-learning' zu 'online:lernen' gefunden.")

# Visualization 2: frequency of specific tags
def visualize_tags(bib_database):
    # Defined search terms
    search_terms = {
        '0': 'digital:learning',
        '1': 'learning:management:system',
        '2': 'online:Lernplattform',
        '3': 'online:Lernumgebung',
        '4': 'MOOC',
        '5': 'e-learning',
        '6': 'Bildung:Technologie',
        '7': 'digital:Medien',
        '8': 'blended:learning',
        '9': 'digital:lernen',
        'a': 'online:lernen',
        'b': 'online:learning'
    }

    # Build the combined tags (lowercased, because the entry keywords are
    # lowercased below as well)
    numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b']
    types = [
        'Zeitschriftenartikel',
        'Buch',
        'Buchteil',
        'Bericht',
        'Konferenz-Paper'
    ]
    tags_to_search = set(
        f"#{number}:{type_}:{search_terms[number]}".lower()
        for number, type_ in product(numbers, types)
    )

    # Initialize the tag counts
    tag_counts = defaultdict(int)
    if not bib_database or not bib_database.entries:
        print("Fehler: Keine Einträge in der Datenbank gefunden.")
        return

    for entry in bib_database.entries:
        if 'keywords' in entry:
            entry_keywords = map(
                str.lower,
                map(str.strip, entry['keywords'].replace('\\#', '#').split(','))
            )
            for keyword in entry_keywords:
                for tag in tags_to_search:
                    if tag in keyword:
                        tag_counts[tag] += 1

    # Prepare the data for the visualization
    data = [
        {'Tag': tag, 'Count': count, 'Type': tag.split(':')[1].lower()}
        for tag, count in tag_counts.items()
        if count > 0
    ]

    if not data:
        print("Warnung: Keine Tags gefunden, die den Suchkriterien entsprechen.")
        return

    # Color mapping per publication type
    color_map = {
        'zeitschriftenartikel': colors['primaryLine'],
        'konferenz-paper': colors['secondaryLine'],
        'buch': colors['depthArea'],
        'buchteil': colors['brightArea'],
        'bericht': colors['accent']
    }

    # Create the visualization
    total_count = sum(tag_counts.values())
    fig = px.bar(
        data,
        x='Tag',
        y='Count',
        title=f'Häufigkeit der Suchbegriffe in der Literaturanalyse (n={total_count}, Stand: {current_date})',
        labels={'Tag': 'Tag', 'Count': 'Anzahl der Vorkommen'},
        color='Type',
        color_discrete_map=color_map,
        text_auto=True
    )

    # Adjust the layout
    fig.update_layout(
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        font=dict(color=colors['white']),
        margin=dict(l=0, r=0, t=40, b=40),
        autosize=True
    )

    fig.update_traces(
        marker_line_color=colors['white'],
        marker_line_width=1.5
    )

    fig.show(config={"responsive": True})
    export_visualize_tags(fig)

# Visualization 3: frequency of index terms
def visualize_index(bib_database):
    index_terms = [
        'Lernsystemarchitektur',
        'Bildungstheorien',
        'Lehr- und Lerneffektivität',
        'Kollaboratives Lernen',
        'Bewertungsmethoden',
        'Technologieintegration',
        'Datenschutz und IT-Sicherheit',
        'Systemanpassung',
        'Krisenreaktion im Bildungsbereich',
        'Forschungsansätze'
    ]

    index_counts = defaultdict(int)
    for entry in bib_database.entries:
        if 'keywords' in entry:
            entry_keywords = list(map(str.lower, map(str.strip, entry['keywords'].replace('\\#', '#').split(','))))
            for index_term in index_terms:
                if index_term.lower() in entry_keywords:
                    index_counts[index_term] += 1

    index_data = [{'Index': index, 'Count': count} for index, count in index_counts.items()]
    index_data = sorted(index_data, key=lambda x: x['Count'], reverse=True)

    total_count = sum(index_counts.values())
    print(f"Häufigkeit Indizes (Gesamtanzahl: {total_count}):")
    print(tabulate(index_data, headers="keys", tablefmt="grid"))

    fig = px.bar(
        index_data,
        x='Index',
        y='Count',
        title=f'Relevanzschlüssel nach Indexkategorien (n={total_count}, Stand: {current_date})',
        labels={'Index': 'Index', 'Count': 'Anzahl der Vorkommen'},
        text_auto=True
    )

    fig.update_layout(
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        font=dict(color=colors['white']),
        margin=dict(l=0, r=0, t=40, b=40),
        autosize=True
    )

    fig.update_traces(marker_color=colors['primaryLine'], marker_line_color=colors['white'], marker_line_width=1.5)

    fig.show(config={"responsive": True})
    export_visualize_index(fig)

# Visualization 4: frequency of research sub-questions
def visualize_research_questions(bib_database):
    research_questions = {
        'promotion:fu1': 'Akzeptanz und Nützlichkeit (FU1)',
        'promotion:fu2a': 'Effekt für Lernende (FU2a)',
        'promotion:fu2b': 'Effekt-Faktoren für Lehrende (FU2b)',
        'promotion:fu3': 'Konzeption und Merkmale (FU3)',
        'promotion:fu4a': 'Bildungswissenschaftliche Mechanismen (FU4a)',
        'promotion:fu4b': 'Technisch-gestalterische Mechanismen (FU4b)',
        'promotion:fu5': 'Möglichkeiten und Grenzen (FU5)',
        'promotion:fu6': 'Beurteilung als Kompetenzerwerbssystem (FU6)',
        'promotion:fu7': 'Inputs und Strategien (FU7)'
    }

    rq_counts = defaultdict(int)
    for entry in bib_database.entries:
        if 'keywords' in entry:
            entry_keywords = list(map(str.lower, map(str.strip, entry['keywords'].replace('\\#', '#').split(','))))
            for keyword in entry_keywords:
                if keyword in research_questions:
                    rq_counts[keyword] += 1

    rq_data = [{'Research_Question': research_questions[keyword], 'Count': count} for keyword, count in rq_counts.items()]
    rq_data = sorted(rq_data, key=lambda x: x['Count'], reverse=True)

    rq_data_df = pd.DataFrame(rq_data)

    total_count = rq_data_df['Count'].sum()
    print(f"Häufigkeit Forschungsunterfragen (Gesamtanzahl: {total_count}):")
    print(tabulate(rq_data, headers="keys", tablefmt="grid"))

    fig = px.bar(
        rq_data_df,
        x='Research_Question',
        y='Count',
        title=f'Zuordnung der Literatur zu Forschungsunterfragen (n={total_count}, Stand: {current_date})',
        labels={'Research_Question': 'Forschungsunterfrage', 'Count': 'Anzahl der Vorkommen'},
        text_auto=True
    )

    fig.update_layout(
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        font=dict(color=colors['white']),
        margin=dict(l=0, r=0, t=40, b=40),
        autosize=True
    )

    fig.update_traces(marker_color=colors['primaryLine'], marker_line_color=colors['white'], marker_line_width=1.5)

    fig.show(config={"responsive": True})
    export_visualize_research_questions(fig)

# Visualization 5: frequency of specific categories
def visualize_categories(bib_database):
    categories = {
        'promotion:argumentation': 'Argumentation',
        'promotion:kerngedanke': 'Kerngedanke',
        'promotion:weiterführung': 'Weiterführung',
        'promotion:schlussfolgerung': 'Schlussfolgerung'
    }

    cat_counts = defaultdict(int)
    for entry in bib_database.entries:
        if 'keywords' in entry:
            entry_keywords = list(map(str.lower, map(str.strip, entry['keywords'].replace('\\#', '#').split(','))))
            for keyword in entry_keywords:
                if keyword in categories:
                    cat_counts[keyword] += 1

    cat_data = [{'Category': categories[keyword], 'Count': count} for keyword, count in cat_counts.items()]
    cat_data = sorted(cat_data, key=lambda x: x['Count'], reverse=True)

    cat_data_df = pd.DataFrame(cat_data)

    total_count = cat_data_df['Count'].sum()
    print(f"Häufigkeit Kategorien (Gesamtanzahl: {total_count}):")
    print(tabulate(cat_data, headers="keys", tablefmt="grid"))

    fig = px.bar(
        cat_data_df,
        x='Category',
        y='Count',
        title=f'Textsortenzuordnung der analysierten Quellen (n={total_count}, Stand: {current_date})',
        labels={'Category': 'Kategorie', 'Count': 'Anzahl der Vorkommen'},
        text_auto=True
    )

    fig.update_layout(
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        font=dict(color=colors['white']),
        margin=dict(l=0, r=0, t=40, b=40),
        autosize=True
    )

    fig.update_traces(marker_color=colors['primaryLine'], marker_line_color=colors['white'], marker_line_width=1.5)

    fig.show(config={"responsive": True})
    export_visualize_categories(fig)

# Time series analysis of the publications
def extract_year_from_entry(entry):
    year_str = entry.get('year', '').strip()
    if not year_str:
        return None
    try:
        matches = re.findall(r'\b(19[0-9]{2}|20[0-9]{2})\b', year_str)
        years = [int(y) for y in matches if 1900 <= int(y) <= datetime.now().year + 1]
        return min(years) if years else None
    except Exception as e:
        print(f"⚠️ Fehler bei Jahreswert '{year_str}': {e}")
        return None

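# Example (illustrative): Zotero date strings often contain more than the
# year; the earliest plausible year wins.
#   >>> extract_year_from_entry({'year': '2021-03-15'})
#   2021
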
def visualize_time_series(bib_database):
    publication_years = []

    for entry in bib_database.entries:
        year = extract_year_from_entry(entry)
        if year is not None:
            publication_years.append(year)

    if publication_years:
        year_counts = Counter(publication_years)
        df = pd.DataFrame(year_counts.items(), columns=['Year', 'Count']).sort_values('Year')

        fig = px.line(
            df,
            x='Year',
            y='Count',
            title=f'Jährliche Veröffentlichungen in der Literaturanalyse (n={sum(year_counts.values())}, Stand: {current_date})',
            labels={'Year': 'Jahr', 'Count': 'Anzahl der Veröffentlichungen'}
        )

        fig.update_layout(
            plot_bgcolor=colors['background'],
            paper_bgcolor=colors['background'],
            font=dict(color=colors['white']),
            xaxis=dict(
                tickmode='linear',
                dtick=2,
                tick0=min(publication_years)
            ),
            margin=dict(l=0, r=0, t=40, b=40),
            autosize=True
        )

        fig.update_traces(line=dict(color=colors['secondaryLine'], width=3))
        fig.show(config={"responsive": True})
        export_visualize_time_series(fig)
    else:
        print("Keine gültigen Veröffentlichungsjahre gefunden.")

# Top authors by number of works
def visualize_top_authors(bib_database):
    top_n = 25  # number of top authors to display
    author_counts = defaultdict(int)
    for entry in bib_database.entries:
        if 'author' in entry and entry['author'].strip():
            # Note: load_zotero_entries() currently returns at most one creator
            # per item; the ' and ' split matters for BibTeX-style author strings.
            authors = [a.strip() for a in entry['author'].split(' and ') if a.strip()]
            for author in authors:
                author_counts[author] += 1

    top_authors = Counter(author_counts).most_common(top_n)
    if top_authors:
        df = pd.DataFrame(top_authors, columns=['Author', 'Count'])

        fig = px.bar(
            df,
            x='Author',
            y='Count',
            title=f'Meistgenannte Autor:innen in der Literaturanalyse (Top {top_n}, n={sum(author_counts.values())}, Stand: {current_date})',
            labels={'Author': 'Autor', 'Count': 'Anzahl der Werke'},
            text_auto=True
        )
        fig.update_layout(
            plot_bgcolor=colors['background'],
            paper_bgcolor=colors['background'],
            font=dict(color=colors['white']),
            margin=dict(l=0, r=0, t=40, b=40),
            autosize=True
        )
        fig.update_traces(marker_color=colors['primaryLine'], marker_line_color=colors['white'], marker_line_width=1.5)

        fig.show(config={"responsive": True})
        export_visualize_top_authors(fig)
    else:
        print("Keine Autoren gefunden.")

# Top titles by number of works
def normalize_title(title):
    # Remove punctuation and standardize to lower case
    title = title.lower().translate(str.maketrans('', '', ",.!?\"'()[]{}:;"))
    # Collapse whitespace so titles that differ only in minor details merge
    title = " ".join(title.split())
    # Remove common filler phrases that do not help to distinguish titles
    common_phrases = ['eine studie', 'untersuchung der', 'analyse von']
    for phrase in common_phrases:
        title = title.replace(phrase, '')
    return title.strip()

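# Example (illustrative):
#   >>> normalize_title('Eine Studie: E-Learning!')
#   'e-learning'
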
def visualize_top_publications(bib_database):
    top_n = 25  # number of top publications to display
    publication_counts = defaultdict(int)

    invalid_titles = {"pdf", "no title found", "published entry", "", None}
    for entry in bib_database.entries:
        title = normalize_title(entry.get('title', ''))
        if title not in invalid_titles and len(title) > 5:
            publication_counts[title] += 1

    top_publications = sorted(publication_counts.items(), key=lambda x: x[1], reverse=True)[:top_n]
    publication_data = [{'Title': title[:50] + '...' if len(title) > 50 else title, 'Count': count} for title, count in top_publications]

    df = pd.DataFrame(publication_data)

    fig = px.bar(
        df,
        x='Title',
        y='Count',
        title=f'Häufig zitierte Publikationen in der Analyse (Top {top_n}, n={sum(publication_counts.values())}, Stand: {current_date})',
        labels={'Title': 'Titel', 'Count': 'Anzahl der Nennungen'}
    )

    fig.update_layout(
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        font=dict(color=colors['white']),
        xaxis_tickangle=-45,
        margin=dict(l=0, r=0, t=40, b=40),
        autosize=True
    )

    fig.update_traces(marker_color=colors['primaryLine'], marker_line_color=colors['white'], marker_line_width=1.5)

    fig.show(config={"responsive": True})
    export_visualize_top_publications(fig)


##########

# Prepare the data
def prepare_path_data(bib_database):
    research_questions = {
        'promotion:fu1': 'Akzeptanz und Nützlichkeit (FU1)',
        'promotion:fu2a': 'Effekt für Lernende (FU2a)',
        'promotion:fu2b': 'Effekt-Faktoren für Lehrende (FU2b)',
        'promotion:fu3': 'Konzeption und Merkmale (FU3)',
        'promotion:fu4a': 'Bildungswissenschaftliche Mechanismen (FU4a)',
        'promotion:fu4b': 'Technisch-gestalterische Mechanismen (FU4b)',
        'promotion:fu5': 'Möglichkeiten und Grenzen (FU5)',
        'promotion:fu6': 'Beurteilung als Kompetenzerwerbssystem (FU6)',
        'promotion:fu7': 'Inputs und Strategien (FU7)'
    }

    categories = {
        'promotion:argumentation': 'Argumentation',
        'promotion:kerngedanke': 'Kerngedanke',
        'promotion:weiterführung': 'Weiterführung',
        'promotion:schlussfolgerung': 'Schlussfolgerung'
    }

    index_terms = [
        'Lernsystemarchitektur',
        'Bildungstheorien',
        'Lehr- und Lerneffektivität',
        'Kollaboratives Lernen',
        'Bewertungsmethoden',
        'Technologieintegration',
        'Datenschutz und IT-Sicherheit',
        'Systemanpassung',
        'Krisenreaktion im Bildungsbereich',
        'Forschungsansätze'
    ]

    # Zotero's internal type names are mapped back to the German labels that
    # create_path_diagram's color_map expects (assumption: the library holds
    # these five item types).
    type_display = {
        'journalarticle': 'zeitschriftenartikel',
        'conferencepaper': 'konferenz-paper',
        'book': 'buch',
        'booksection': 'buchteil',
        'report': 'bericht'
    }

    data = []

    for entry in bib_database.entries:
        raw_type = entry.get('ENTRYTYPE', '').lower()
        entry_data = {
            'FU': None,
            'Category': None,
            'Index': None,
            'Type': type_display.get(raw_type, raw_type)
        }

        if 'keywords' in entry:
            entry_keywords = list(map(str.lower, map(str.strip, entry['keywords'].replace('\\#', '#').split(','))))

            for key, value in research_questions.items():
                if key in entry_keywords:
                    entry_data['FU'] = value

            for key, value in categories.items():
                if key in entry_keywords:
                    entry_data['Category'] = value

            for index_term in index_terms:
                if index_term.lower() in entry_keywords:
                    entry_data['Index'] = index_term

        # Keep only entries for which all four path stages are known
        if all(value is not None for value in entry_data.values()):
            data.append(entry_data)

    return data

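# Example of a complete record returned by prepare_path_data (illustrative):
#   {'FU': 'Konzeption und Merkmale (FU3)', 'Category': 'Kerngedanke',
#    'Index': 'Technologieintegration', 'Type': 'zeitschriftenartikel'}
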
# Create the path diagram
def create_path_diagram(data):
    labels = []
    sources = []
    targets = []
    values = []
    color_map = {
        'zeitschriftenartikel': colors['primaryLine'],
        'konferenz-paper': colors['secondaryLine'],
        'buch': colors['depthArea'],
        'buchteil': colors['brightArea'],
        'bericht': colors['accent']
    }

    def add_to_labels(label):
        if label not in labels:
            labels.append(label)
        return labels.index(label)

    # Every entry contributes three links: FU -> category -> index -> type
    for entry in data:
        fu_idx = add_to_labels(entry['FU'])
        category_idx = add_to_labels(entry['Category'])
        index_idx = add_to_labels(entry['Index'])
        type_idx = add_to_labels(entry['Type'])

        sources.extend([fu_idx, category_idx, index_idx])
        targets.extend([category_idx, index_idx, type_idx])
        values.extend([1, 1, 1])

    node_colors = [color_map.get(label, colors['primaryLine']) for label in labels]

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color=colors['white'], width=0.5),
            label=labels,
            color=node_colors
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values
        )
    )])

    fig.update_layout(
        title_text=f'Kategorischer Analysepfad der Literatur (n={len(data)}, Stand: {current_date})',
        font=dict(size=10, color=colors['white']),
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background']
    )

    fig.show()
    export_create_path_diagram(fig)


#############

def create_sankey_diagram(bib_database):
    def extract_year(entry):
        """Extract a valid year from an entry's `year` field."""
        year_str = entry.get('year', '').strip()
        # Look for a four-digit year
        year_match = re.search(r'\b\d{4}\b', year_str)
        if year_match:
            return int(year_match.group())
        print(f"Warnung: Ungültiger Jahreswert in Eintrag übersprungen: {year_str}")
        return None

    current_year = datetime.now().year
    filtered_entries = [
        entry for entry in bib_database.entries
        if 'promotion:literaturanalyse' in entry.get('keywords', '').lower()
    ]

    initial_sources = len(filtered_entries)
    screened_sources = initial_sources  # already filtered above

    # Cascading selection steps. Note: the entries carry Zotero type names
    # ('journalArticle', 'thesis'), not the former BibTeX types
    # ('article', 'phdthesis').
    quality_entries = [
        entry for entry in filtered_entries
        if entry.get('ENTRYTYPE') in ('journalArticle', 'thesis')
    ]
    relevance_entries = [
        entry for entry in quality_entries
        if any(rq in entry.get('keywords', '').lower() for rq in ('promotion:fu3', 'promotion:kerngedanke'))
    ]
    thematic_entries = [
        entry for entry in relevance_entries
        if any(kw in entry.get('keywords', '').lower() for kw in ('digital', 'learning'))
    ]
    recent_entries = [
        entry for entry in thematic_entries
        if (year := extract_year(entry)) and year >= current_year - 5
    ]
    classic_entries = [
        entry for entry in thematic_entries
        if (year := extract_year(entry)) and year < current_year - 5
        and 'classic' in entry.get('keywords', '').lower()
    ]

    quality_sources = len(quality_entries)
    relevance_sources = len(relevance_entries)
    thematic_sources = len(thematic_entries)
    recent_sources = len(recent_entries)
    classic_sources = len(classic_entries)
    selected_sources = recent_sources + classic_sources

    # Calculate the sample size
    sample_size = calculate_sample_size(initial_sources)

    # Define the phases and the links between them
    phases = [
        "Identifizierte Quellen",
        "Nach Screening (Literaturanalyse-Markierung)",
        "Nach Qualitätsprüfung (Artikel und Dissertationen)",
        "Nach Relevanzprüfung (FU3 und Kerngedanken)",
        "Nach thematischer Prüfung (Digital & Learning)",
        "Aktuelle Forschung (letzte 5 Jahre)",
        "Klassische Werke",
        "Ausgewählte Quellen (Endauswahl)"
    ]

    # Node 0 feeds node 1, and so on; node 4 (thematic check) fans out to
    # nodes 5 (recent), 6 (classic) and 7 (final selection).
    sources = [0, 1, 2, 3, 4, 4, 4]
    targets = [1, 2, 3, 4, 5, 6, 7]
    values = [
        screened_sources,
        quality_sources,
        relevance_sources,
        thematic_sources,
        recent_sources,
        classic_sources,
        selected_sources
    ]

    # Percentages for the node labels (each relative to the previous phase)
    percentages = [
        "100.0%",  # starting value
        f"{screened_sources / initial_sources * 100:.1f}%" if initial_sources > 0 else "0.0%",
        f"{quality_sources / screened_sources * 100:.1f}%" if screened_sources > 0 else "0.0%",
        f"{relevance_sources / quality_sources * 100:.1f}%" if quality_sources > 0 else "0.0%",
        f"{thematic_sources / relevance_sources * 100:.1f}%" if relevance_sources > 0 else "0.0%",
        f"{recent_sources / thematic_sources * 100:.1f}%" if thematic_sources > 0 else "0.0%",
        f"{classic_sources / thematic_sources * 100:.1f}%" if thematic_sources > 0 else "0.0%",
        f"{selected_sources / (recent_sources + classic_sources) * 100:.1f}%" if (recent_sources + classic_sources) > 0 else "0.0%"
    ]

    # Append the percentages to the node labels
    node_labels = [f"{ph} ({pct})" for ph, pct in zip(phases, percentages)]

    # Colors for the individual phases
    node_colors = [
        colors['primaryLine'],        # identified sources
        colors['secondaryLine'],      # after screening
        colors['brightArea'],         # after quality check
        colors['depthArea'],          # after relevance check
        colors['positiveHighlight'],  # after thematic check
        colors['negativeHighlight'],  # recent research
        colors['accent'],             # classic works
        colors['positiveHighlight']   # selected sources
    ]

    # Build the Sankey diagram
    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=node_labels,
            color=node_colors
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            hoverinfo='all',  # show detailed information on mouseover
            color=colors['accent']
        )
    ))

    # Adjust the layout
    fig.update_layout(
        title_text=f"Flussdiagramm der Literaturselektion (Stichprobe: n={sample_size}, Stand: {current_date})",
        font_size=12,  # larger font size for better readability
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        font=dict(color=colors['white'])
    )

    fig.show()
    export_create_sankey_diagram(fig)

##########

def visualize_sources_status(bib_database):
    """
    Visualize the status of analysed and not yet analysed sources per search folder.
    """
    search_folder_tags = [
        "#1:zeitschriftenartikel:learning:management:system",
        "#2:zeitschriftenartikel:online:lernplattform",
        "#3:zeitschriftenartikel:online:lernumgebung",
        "#4:zeitschriftenartikel:mooc",
        "#5:zeitschriftenartikel:e-learning",
        "#6:zeitschriftenartikel:bildung:technologie",
        "#7:zeitschriftenartikel:digital:medien",
        "#8:zeitschriftenartikel:blended:learning",
        "#9:zeitschriftenartikel:digital:lernen",
        "#a:zeitschriftenartikel:online:lernen",
        "#b:zeitschriftenartikel:online:learning",
        "#0:zeitschriftenartikel:digital:learning",
        "#1:konferenz-paper:learning:management:system",
        "#2:konferenz-paper:online:lernplattform",
        "#3:konferenz-paper:online:lernumgebung",
        "#4:konferenz-paper:mooc",
        "#5:konferenz-paper:e-learning",
        "#6:konferenz-paper:bildung:technologie",
        "#7:konferenz-paper:digital:medien",
        "#8:konferenz-paper:blended:learning",
        "#9:konferenz-paper:digital:lernen",
        "#a:konferenz-paper:online:lernen",
        "#b:konferenz-paper:online:learning",
        "#0:konferenz-paper:digital:learning"
    ]

    category_tags = {"promotion:argumentation", "promotion:kerngedanke", "promotion:weiterführung", "promotion:schlussfolgerung"}
    source_data = defaultdict(lambda: {'Identifiziert': 0, 'Analysiert': 0})

    if not bib_database or not bib_database.entries:
        print("Fehler: Die Datenbank enthält keine Einträge.")
        return

    for entry in bib_database.entries:
        keywords = entry.get('keywords', '')
        if not keywords:
            continue

        entry_keywords = set(map(str.lower, map(str.strip, keywords.replace('\\#', '#').split(','))))

        for tag in search_folder_tags:
            if tag.lower() in entry_keywords:
                source_data[tag]['Identifiziert'] += 1
                # An entry counts as analysed once it carries one of the category tags
                if entry_keywords & category_tags:
                    source_data[tag]['Analysiert'] += 1

    table_data = []
    analysiert_values = []
    nicht_analysiert_values = []
    analysiert_colors = []
    tags = []

    for tag, counts in sorted(source_data.items(), key=lambda item: item[1]['Identifiziert'], reverse=True):
        stichprobe = calculate_sample_size(counts['Identifiziert'])
        noch_zu_analysieren = counts['Identifiziert'] - counts['Analysiert']
        noch_benoetigt_fuer_stichprobe = max(0, stichprobe - counts['Analysiert'])

        table_data.append([
            tag,
            counts['Identifiziert'],
            counts['Analysiert'],
            noch_zu_analysieren,
            stichprobe,
            noch_benoetigt_fuer_stichprobe
        ])

        analysiert_values.append(counts['Analysiert'])
        nicht_analysiert_values.append(noch_zu_analysieren)
        tags.append(tag)

        analysiert_colors.append(colors['positiveHighlight'] if counts['Analysiert'] >= stichprobe else colors['negativeHighlight'])

    print(tabulate(
        table_data,
        headers=['Suchordner', 'Identifiziert', 'Analysiert', 'nicht-Analysiert', 'Stichprobe', 'Noch benötigt für Stichprobe'],
        tablefmt='grid'
    ))

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=tags,
        y=analysiert_values,
        name='Analysiert',
        marker=dict(color=analysiert_colors)
    ))

    fig.add_trace(go.Bar(
        x=tags,
        y=nicht_analysiert_values,
        name='Nicht-Analysiert',
        marker=dict(color=colors['primaryLine'])
    ))

    fig.update_layout(
        barmode='stack',
        title=f'Analyse- und Stichprobenstatus je Suchordner (n={sum(counts["Identifiziert"] for counts in source_data.values())}, Stand: {current_date})',
        xaxis_title='Suchbegriffsordner',
        yaxis_title='Anzahl der Quellen',
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        font=dict(color=colors['white']),
        xaxis=dict(
            categoryorder='array',
            categoryarray=search_folder_tags
        )
    )

    fig.show()
    export_visualize_sources_status(fig)

#############


# Create a word cloud from the titles.
# (Note: the export_fig_create_wordcloud_from_titles flag is currently unused;
# the word cloud is rendered with matplotlib and not exported.)
def create_wordcloud_from_titles(bib_database, stop_words):
    titles = [entry.get('title', '') for entry in bib_database.entries]

    # Count the words
    word_counts = defaultdict(int)
    for title in titles:
        for word in title.split():
            word = word.lower().strip(",.!?\"'()[]{}:;")
            if word and word not in stop_words:
                word_counts[word] += 1

    # Build the word cloud
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color=colors['background'],
        color_func=lambda *args, **kwargs: random.choice(word_colors)
    ).generate_from_frequencies(word_counts)

    # Display the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Häufigkeitsanalyse von Titelwörtern (Stand: {current_date})', color=colors['white'])
    plt.show()

# Export function for visualize_search_term_distribution
def export_visualize_search_term_distribution(fig):
    _export_figure(fig, "visualize_search_term_distribution", export_fig_visualize_search_term_distribution)

# Pie chart of the distribution of entries across primary, secondary and
# tertiary search term folders
def visualize_search_term_distribution(bib_database):
    """
    Create a pie chart of the distribution of entries across primary,
    secondary and tertiary search term folders.
    """
    hierarchy_counts = {
        'Primär': 0,
        'Sekundär': 0,
        'Tertiär': 0
    }

    primary_folders = {
        'S:01 Learning Management System',
        'S:02 Online-Lernplattform',
        'S:03 Online-Lernumgebung',
        'S:05 eLearning',
        'S:04 MOOC',
        'S:06 Bildungstechnologie',
        'S:07 Digitale Medien',
        'S:08 Blended Learning',
        'S:09 Digitales Lernen',
        'S:12 Digital Learning',
        'S:10 Online Lernen',
        'S:11 Online Learning'
    }

    secondary_folders = {
        'S:13 Berichte',
        'S:14 Agiles Lernen',
        'S:15 Learning Analytics'
    }

    tertiary_folders = {
        'S:16 Dissertationen',
        'S:17 ePortfolio'
    }

    conn = sqlite3.connect(ZOTERO_DB_PATH)
    cursor = conn.cursor()

    query = """
    SELECT collections.collectionName, COUNT(DISTINCT items.itemID)
    FROM items
    JOIN collectionItems ON items.itemID = collectionItems.itemID
    JOIN collections ON collectionItems.collectionID = collections.collectionID
    WHERE collections.collectionName IN (
        'S:01 Learning Management System',
        'S:02 Online-Lernplattform',
        'S:03 Online-Lernumgebung',
        'S:05 eLearning',
        'S:04 MOOC',
        'S:06 Bildungstechnologie',
        'S:07 Digitale Medien',
        'S:08 Blended Learning',
        'S:09 Digitales Lernen',
        'S:12 Digital Learning',
        'S:10 Online Lernen',
        'S:11 Online Learning',
        'S:13 Berichte',
        'S:14 Agiles Lernen',
        'S:15 Learning Analytics',
        'S:16 Dissertationen',
        'S:17 ePortfolio'
    )
    GROUP BY collections.collectionName
    """

    cursor.execute(query)
    rows = cursor.fetchall()
    conn.close()

    for collection, count in rows:
        if collection in primary_folders:
            hierarchy_counts['Primär'] += count
        elif collection in secondary_folders:
            hierarchy_counts['Sekundär'] += count
        elif collection in tertiary_folders:
            hierarchy_counts['Tertiär'] += count

    labels = list(hierarchy_counts.keys())
    values = list(hierarchy_counts.values())
    colors_pie = [colors['primaryLine'], colors['secondaryLine'], colors['brightArea']]

    fig = go.Figure(data=[go.Pie(
        labels=labels,
        values=values,
        marker=dict(colors=colors_pie),
        textinfo='label+percent',
        hoverinfo='label+value'
    )])

    fig.update_layout(
        title='Verteilung der Suchbegriffsordner (Primär, Sekundär, Tertiär)',
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        font=dict(color=colors['white'])
    )

    fig.show()
    export_visualize_search_term_distribution(fig)

# Run all visualizations
visualize_network(bib_database)
visualize_tags(bib_database)
visualize_index(bib_database)
visualize_research_questions(bib_database)
visualize_categories(bib_database)
visualize_time_series(bib_database)
visualize_top_authors(bib_database)
visualize_top_publications(bib_database)
data = prepare_path_data(bib_database)
create_path_diagram(data)
create_sankey_diagram(bib_database)
visualize_sources_status(bib_database)
create_wordcloud_from_titles(bib_database, stop_words)
visualize_search_term_distribution(bib_database)