Comprehensive Spotify Analysis Program

Code


import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import geoip2.database
import folium
from folium.plugins import MarkerCluster
from glob import glob

# Function to load and combine all JSON files
def load_spotify_data(data_directory):
    """Load every ``*.json`` file in ``data_directory`` and merge the records.

    Files are read in sorted path order so the combined list is
    deterministic. A file that cannot be opened or parsed, or whose top-level
    JSON value is not a list, is reported on stdout and skipped so one bad
    file does not abort the whole load.

    Args:
        data_directory: Directory searched (non-recursively) for JSON files.

    Returns:
        list: The records of all successfully loaded files, concatenated.
        Empty when no file could be loaded.
    """
    all_data = []

    for file in sorted(glob(os.path.join(data_directory, '*.json'))):
        try:
            # errors='ignore' drops undecodable bytes instead of raising.
            with open(file, 'r', encoding='utf-8', errors='ignore') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            print(f"Error reading {file}: {e}")
            continue

        if not isinstance(data, list):
            # Extending with a dict would silently add its keys as "records".
            print(f"Skipping {file}: expected a JSON list, got {type(data).__name__}")
            continue

        all_data.extend(data)

    return all_data

# Function to convert JSON data to pandas DataFrame
def json_to_dataframe(data):
    """Build a pandas DataFrame from the loaded records.

    Args:
        data: Input handed straight to the ``pd.DataFrame`` constructor.

    Returns:
        pandas.DataFrame: The frame constructed from ``data``.
    """
    frame = pd.DataFrame(data)
    return frame

# Helper: value of a column with the largest summed play time, or None
def _top_by_play_time(df, column):
    """Return the value of ``column`` with the highest total ``ms_played``.

    Returns None when the grouped result is empty (groupby drops null keys,
    so this happens when every value in ``column`` is null or ``df`` has no
    rows); calling ``idxmax`` on that empty result would raise ValueError.
    """
    totals = df.groupby(column)['ms_played'].sum()
    if totals.empty:
        return None
    return totals.idxmax()


# Function to show basic statistics of your listening history
def basic_statistics(df):
    """Print summary statistics for a streaming-history DataFrame.

    Reads the ``ms_played``, ``master_metadata_track_name``,
    ``master_metadata_album_artist_name`` and ``platform`` columns and prints
    the stream count, total minutes played, the track and artist with the
    most summed play time, and the per-platform stream counts.

    Args:
        df: DataFrame containing the columns listed above.

    Returns:
        None. Results are written to stdout. "N/A" is printed for the top
        track/artist when no row carries a name in that column.
    """
    total_streams = len(df)
    total_time_ms = df['ms_played'].sum()
    total_time_minutes = total_time_ms / (1000 * 60)  # Convert to minutes
    most_played_track = _top_by_play_time(df, 'master_metadata_track_name')
    most_played_artist = _top_by_play_time(df, 'master_metadata_album_artist_name')
    platform_usage = df['platform'].value_counts()

    print(f"Total streams: {total_streams}")
    print(f"Total time played (minutes): {total_time_minutes:.2f}")
    print(f"Most played track: {'N/A' if most_played_track is None else most_played_track}")
    print(f"Most played artist: {'N/A' if most_played_artist is None else most_played_artist}")
    print("\nPlatform usage:\n", platform_usage)

# Function to plot streaming activity over time on a monthly basis (in minutes)
def plot_streaming_by_month(df):
    """Plot total minutes played per calendar month as a line chart.

    The ``ts`` column is parsed with ``pd.to_datetime(errors='coerce')``, so
    unparseable timestamps become NaT. ``ms_played`` is summed per month-end
    bin and converted to minutes.

    Args:
        df: DataFrame with ``ts`` and ``ms_played`` columns. It is not
            modified; parsing happens on a derived copy.

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    # Parse on a derived frame so the caller's 'ts' column is left untouched.
    timeline = df.assign(ts=pd.to_datetime(df['ts'], errors='coerce')).sort_values(by='ts')

    # An offset object avoids the 'M' string alias, deprecated since pandas 2.2.
    month_end = pd.offsets.MonthEnd()
    monthly_ms = timeline.groupby(pd.Grouper(key='ts', freq=month_end))['ms_played'].sum()
    monthly_minutes = monthly_ms / (1000 * 60)  # Convert to minutes
    monthly_minutes = monthly_minutes.asfreq(month_end, fill_value=0)

    plt.figure(figsize=(10, 6))
    monthly_minutes.plot(kind='line')
    plt.title('Streaming Activity Over Time (Monthly Basis, Minutes Played)')
    plt.xlabel('Month')
    plt.ylabel('Total Minutes Played')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Function to plot top N most played artists (in minutes)
def plot_top_artists_in_minutes(df, n=10):
    """Show a bar chart of the ``n`` artists with the most play time.

    Args:
        df: DataFrame with ``master_metadata_album_artist_name`` and
            ``ms_played`` columns.
        n: Number of artists to include (default 10).

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    ms_per_artist = df.groupby('master_metadata_album_artist_name')['ms_played'].sum()
    leading_ms = ms_per_artist.sort_values(ascending=False).head(n)
    leading_minutes = leading_ms / (1000 * 60)

    plt.figure(figsize=(10, 6))
    leading_minutes.plot(kind='bar')
    plt.title(f'Top {n} Most Played Artists (Minutes Played)')
    plt.xlabel('Artist')
    plt.ylabel('Total Minutes Played')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Function to plot top N most played tracks (in minutes)
def plot_top_tracks_in_minutes(df, n=10):
    """Show a bar chart of the ``n`` tracks with the most play time.

    Args:
        df: DataFrame with ``master_metadata_track_name`` and ``ms_played``
            columns.
        n: Number of tracks to include (default 10).

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    ms_per_track = df.groupby('master_metadata_track_name')['ms_played'].sum()
    leading_ms = ms_per_track.sort_values(ascending=False).head(n)
    leading_minutes = leading_ms / (1000 * 60)

    plt.figure(figsize=(10, 6))
    leading_minutes.plot(kind='bar')
    plt.title(f'Top {n} Most Played Tracks (Minutes Played)')
    plt.xlabel('Track')
    plt.ylabel('Total Minutes Played')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Load GeoLite2 database
def load_geoip_database(db_path):
    """Open the GeoIP database file at ``db_path``.

    Args:
        db_path: Path passed to ``geoip2.database.Reader``.

    Returns:
        The ``geoip2.database.Reader`` created for ``db_path``.
    """
    reader = geoip2.database.Reader(db_path)
    return reader

# Function to geolocate IP addresses
def geolocate_ips(df, geoip_reader):
    """Resolve each value of ``df['ip_addr_decrypted']`` to a location.

    Lookups are best-effort: any exception raised while resolving an address
    yields a row whose location fields are all None. Each distinct address is
    looked up only once and the result is reused for repeated rows.

    Args:
        df: DataFrame with an ``ip_addr_decrypted`` column.
        geoip_reader: Object exposing ``city(ip)`` whose result provides
            ``country.name``, ``city.name``, ``location.latitude`` and
            ``location.longitude``.

    Returns:
        pandas.DataFrame: One row per input row, in input order, with columns
        ``ip``, ``country``, ``city``, ``latitude`` and ``longitude``. The
        columns are present even when ``df`` has no rows.
    """
    columns = ['ip', 'country', 'city', 'latitude', 'longitude']
    resolved = {}  # distinct ip -> record, so repeated addresses skip the lookup
    location_data = []

    for ip in df['ip_addr_decrypted']:
        if ip not in resolved:
            try:
                response = geoip_reader.city(ip)
                record = {
                    'ip': ip,
                    'country': response.country.name,
                    'city': response.city.name,
                    'latitude': response.location.latitude,
                    'longitude': response.location.longitude
                }
            except Exception:
                # Deliberately broad: missing, malformed or unknown addresses
                # must not abort the run; they become an empty location row.
                record = {
                    'ip': ip,
                    'country': None,
                    'city': None,
                    'latitude': None,
                    'longitude': None
                }
            resolved[ip] = record
        location_data.append(resolved[ip])

    return pd.DataFrame(location_data, columns=columns)

# Helper: render one folium map from prepared (location, popup) pairs and save it
def _save_marker_map(points, output_path, clustered, map_width, map_height, zoom_start):
    """Create a map with one marker per point and save it to ``output_path``.

    When ``clustered`` is true the markers are added to a ``MarkerCluster``
    layer; otherwise they are added directly to the map.
    """
    marker_map = folium.Map(location=[20, 0], zoom_start=zoom_start, width=map_width, height=map_height)
    target = MarkerCluster().add_to(marker_map) if clustered else marker_map

    for location, popup in points:
        folium.Marker(location=list(location), popup=popup).add_to(target)

    marker_map.save(output_path)


# Function to create both a full map and a clustered map without point limitation
def create_full_and_clustered_maps(df, map_width=800, map_height=600, zoom_start=2,
                                   full_map_path="spotify_streams_full_map.html",
                                   clustered_map_path="spotify_streams_clustered_map.html"):
    """Save two HTML maps of the located rows: one plain, one clustered.

    Rows whose ``latitude`` or ``longitude`` is null are skipped. Every
    remaining row becomes a marker whose popup shows its city and country.

    Args:
        df: DataFrame with ``latitude``, ``longitude``, ``city`` and
            ``country`` columns.
        map_width: Width passed to ``folium.Map``.
        map_height: Height passed to ``folium.Map``.
        zoom_start: Initial zoom level passed to ``folium.Map``.
        full_map_path: Output file for the map with individual markers.
        clustered_map_path: Output file for the map using marker clustering.

    Returns:
        None. Both maps are written to disk and the paths are printed.
    """
    # Collect the usable rows once; both maps are built from the same points.
    points = [
        ((row['latitude'], row['longitude']), f"City: {row['city']}, Country: {row['country']}")
        for _, row in df.iterrows()
        if not pd.isnull(row['latitude']) and not pd.isnull(row['longitude'])
    ]

    # Full Map - Display all points without clustering
    _save_marker_map(points, full_map_path, False, map_width, map_height, zoom_start)
    print(f"Full map saved as {full_map_path}")

    # Clustered Map - Use marker clustering
    _save_marker_map(points, clustered_map_path, True, map_width, map_height, zoom_start)
    print(f"Clustered map saved as {clustered_map_path}")

# Main execution
if __name__ == "__main__":
    # Directory containing your Spotify JSON files
    data_directory = r'C:\Users\eshaa\Downloads\Spotify Extended Streaming History'  # Adjust this path

    # Path to the GeoLite2 database
    geoip_db_path = r'C:\Users\eshaa\Downloads\GeoLite2-City_20241001\GeoLite2-City_20241001\GeoLite2-City.mmdb'  # Adjust this path

    # Load and process the data
    spotify_data = load_spotify_data(data_directory)
    if spotify_data:
        spotify_df = json_to_dataframe(spotify_data)

        # Show basic statistics
        print("\n--- Basic Statistics ---")
        basic_statistics(spotify_df)

        # Plot monthly streaming activity
        print("\n--- Streaming Activity Over Time ---")
        plot_streaming_by_month(spotify_df)

        # Plot top artists and tracks
        print("\n--- Top 10 Most Played Artists ---")
        plot_top_artists_in_minutes(spotify_df, n=10)

        print("\n--- Top 10 Most Played Tracks ---")
        plot_top_tracks_in_minutes(spotify_df, n=10)

        # Load GeoLite2 database for IP geolocation; the reader is used as a
        # context manager so the database file is closed even if a lookup fails
        with load_geoip_database(geoip_db_path) as geoip_reader:
            # Geolocate IP addresses
            location_df = geolocate_ips(spotify_df, geoip_reader)
        print(location_df.head())  # Show geolocated IPs

        # Generate both full map and clustered map without point limitation
        create_full_and_clustered_maps(location_df, map_width=600, map_height=400, zoom_start=2)
    else:
        print("No data found or could not load data.")