# Code: Spotify streaming-history analysis (statistics, plots, IP geolocation maps)
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import geoip2.database
import folium
from folium.plugins import MarkerCluster
from glob import glob
# Function to load and combine all JSON files
def load_spotify_data(data_directory):
    """Load every ``*.json`` file in *data_directory* and merge their records.

    Each file is expected to hold a top-level JSON array of records. A file
    that cannot be opened or parsed, or whose top-level value is not a list,
    is reported on stdout and skipped so that one bad file does not abort
    the whole load.

    Args:
        data_directory: Directory searched (non-recursively) for ``*.json``.

    Returns:
        A list with the records of all loadable files, merged in sorted
        file-path order. Empty when nothing could be loaded.
    """
    all_data = []
    # Sort so the merge order does not depend on the OS directory listing.
    for file in sorted(glob(os.path.join(data_directory, '*.json'))):
        try:
            # errors='ignore' drops undecodable bytes instead of raising.
            with open(file, 'r', encoding='utf-8', errors='ignore') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are both
            # ValueError subclasses; OSError covers unreadable files.
            print(f"Error reading {file}: {e}")
            continue
        if not isinstance(data, list):
            # Extending with a dict would silently add its keys as "records".
            print(
                f"Skipping {file}: expected a JSON array of records, "
                f"got {type(data).__name__}"
            )
            continue
        all_data.extend(data)
    return all_data
# Function to convert JSON data to pandas DataFrame
def json_to_dataframe(data):
    """Build a pandas DataFrame from the loaded records.

    Args:
        data: Anything ``pd.DataFrame`` accepts; the script passes the list
            of record dicts returned by the loader.

    Returns:
        The resulting ``pd.DataFrame``.
    """
    frame = pd.DataFrame(data)
    return frame
# Function to show basic statistics of your listening history
def basic_statistics(df):
    """Print headline statistics of the listening history to stdout.

    Reports the number of streams, total play time in minutes, the track
    and the artist with the largest summed play time, and the stream count
    per platform.

    Args:
        df: DataFrame with the columns ``ms_played``,
            ``master_metadata_track_name``,
            ``master_metadata_album_artist_name`` and ``platform``.

    Returns:
        None. When no row has a track (or artist) name, the corresponding
        "most played" line prints ``None`` instead of raising.
    """
    def _top_by_play_time(column):
        # groupby drops null keys, so the result is empty when every name is
        # missing; idxmax() would raise ValueError on that.
        totals = df.groupby(column)['ms_played'].sum()
        return None if totals.empty else totals.idxmax()

    total_streams = len(df)
    total_time_minutes = df['ms_played'].sum() / (1000 * 60)  # ms -> minutes
    most_played_track = _top_by_play_time('master_metadata_track_name')
    most_played_artist = _top_by_play_time('master_metadata_album_artist_name')
    platform_usage = df['platform'].value_counts()

    print(f"Total streams: {total_streams}")
    print(f"Total time played (minutes): {total_time_minutes:.2f}")
    print(f"Most played track: {most_played_track}")
    print(f"Most played artist: {most_played_artist}")
    print("\nPlatform usage:\n", platform_usage)
# Function to plot streaming activity over time on a monthly basis (in minutes)
def plot_streaming_by_month(df):
    """Plot total minutes played per calendar month as a line chart.

    Args:
        df: DataFrame with a ``ts`` column parseable by ``pd.to_datetime``
            and a numeric ``ms_played`` column. The caller's frame is not
            modified.

    Returns:
        None. Shows the figure with ``plt.show()``; prints a notice and
        returns without plotting when no timestamp can be parsed.
    """
    # assign() builds a new frame, so the caller's 'ts' column keeps its
    # original values instead of being overwritten with datetimes.
    timeline = df.assign(ts=pd.to_datetime(df['ts'], errors='coerce'))
    # Unparseable timestamps became NaT and cannot be placed in any month.
    timeline = timeline.dropna(subset=['ts']).sort_values(by='ts')
    if timeline.empty:
        print("No valid timestamps to plot.")
        return

    # An offset object instead of the 'M' string alias: the alias is
    # deprecated in recent pandas, while MonthEnd() means the same thing on
    # every version.
    month_end = pd.offsets.MonthEnd()
    monthly_minutes = (
        timeline.groupby(pd.Grouper(key='ts', freq=month_end))['ms_played'].sum()
        / (1000 * 60)  # ms -> minutes
    )
    # Keep months without any stream on the axis as explicit zeros.
    monthly_minutes = monthly_minutes.asfreq(month_end, fill_value=0)

    plt.figure(figsize=(10, 6))
    monthly_minutes.plot(kind='line')
    plt.title('Streaming Activity Over Time (Monthly Basis, Minutes Played)')
    plt.xlabel('Month')
    plt.ylabel('Total Minutes Played')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
# Function to plot top N most played artists (in minutes)
def plot_top_artists_in_minutes(df, n=10):
    """Show a bar chart of the *n* artists with the most total play time.

    Args:
        df: DataFrame with ``master_metadata_album_artist_name`` and
            ``ms_played`` columns.
        n: Number of artists to display (default 10).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    ms_per_artist = df.groupby('master_metadata_album_artist_name')['ms_played'].sum()
    leading_ms = ms_per_artist.sort_values(ascending=False).head(n)
    leading_minutes = leading_ms / (1000 * 60)  # ms -> minutes

    plt.figure(figsize=(10, 6))
    leading_minutes.plot(kind='bar')
    plt.title(f'Top {n} Most Played Artists (Minutes Played)')
    plt.xlabel('Artist')
    plt.ylabel('Total Minutes Played')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
# Function to plot top N most played tracks (in minutes)
def plot_top_tracks_in_minutes(df, n=10):
    """Show a bar chart of the *n* tracks with the most total play time.

    Args:
        df: DataFrame with ``master_metadata_track_name`` and ``ms_played``
            columns.
        n: Number of tracks to display (default 10).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    ms_per_track = df.groupby('master_metadata_track_name')['ms_played'].sum()
    leading_ms = ms_per_track.sort_values(ascending=False).head(n)
    leading_minutes = leading_ms / (1000 * 60)  # ms -> minutes

    plt.figure(figsize=(10, 6))
    leading_minutes.plot(kind='bar')
    plt.title(f'Top {n} Most Played Tracks (Minutes Played)')
    plt.xlabel('Track')
    plt.ylabel('Total Minutes Played')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
# Load GeoLite2 database
def load_geoip_database(db_path):
    """Open the GeoIP database file at *db_path*.

    Args:
        db_path: Path handed to ``geoip2.database.Reader``.

    Returns:
        The ``geoip2.database.Reader`` created for that file.
    """
    reader = geoip2.database.Reader(db_path)
    return reader
# Function to geolocate IP addresses
def geolocate_ips(df, geoip_reader):
    """Resolve each row's IP address to country, city and coordinates.

    Args:
        df: DataFrame with an ``ip_addr_decrypted`` column.
        geoip_reader: Object whose ``city(ip)`` method returns a response
            exposing ``country.name``, ``city.name``, ``location.latitude``
            and ``location.longitude`` (e.g. ``geoip2.database.Reader``).

    Returns:
        A DataFrame with one row per input row, in input order, and the
        columns ``ip``, ``country``, ``city``, ``latitude``, ``longitude``.
        Rows whose IP is missing or cannot be resolved carry ``None`` in the
        four location fields. The columns are present even for empty input.
    """
    columns = ['ip', 'country', 'city', 'latitude', 'longitude']
    unresolved = {'country': None, 'city': None, 'latitude': None, 'longitude': None}

    def _lookup(ip):
        """Return the location fields for one IP, or the all-None record."""
        try:
            response = geoip_reader.city(ip)
        except Exception:
            # Deliberate best-effort: an unknown or malformed address must
            # not abort the whole batch, so the row is kept with no location.
            return unresolved
        return {
            'country': response.country.name,
            'city': response.city.name,
            'latitude': response.location.latitude,
            'longitude': response.location.longitude,
        }

    # Many rows share the same IP; query the database once per distinct
    # address instead of once per row.
    resolved = {}
    location_data = []
    for ip in df['ip_addr_decrypted']:
        if pd.isnull(ip):
            fields = unresolved  # nothing to look up
        else:
            if ip not in resolved:
                resolved[ip] = _lookup(ip)
            fields = resolved[ip]
        location_data.append({'ip': ip, **fields})
    return pd.DataFrame(location_data, columns=columns)
# Function to create both a full map and a clustered map without point limitation
def create_full_and_clustered_maps(df, map_width=800, map_height=600, zoom_start=2,
                                   full_map_path="spotify_streams_full_map.html",
                                   clustered_map_path="spotify_streams_clustered_map.html"):
    """Write two HTML maps of the located rows: one plain, one clustered.

    Every row with both a latitude and a longitude becomes one marker whose
    popup shows the row's city and country. No cap is applied to the number
    of markers.

    Args:
        df: DataFrame with ``latitude``, ``longitude``, ``city`` and
            ``country`` columns.
        map_width: Width passed to ``folium.Map``.
        map_height: Height passed to ``folium.Map``.
        zoom_start: Initial zoom level of both maps.
        full_map_path: Output file for the map with individual markers.
        clustered_map_path: Output file for the map using ``MarkerCluster``.

    Returns:
        None. Saves both files and prints their paths.
    """
    # Select the plottable rows once; both maps show exactly the same points.
    if df.empty:
        points = []
    else:
        located = df.dropna(subset=['latitude', 'longitude'])
        points = list(zip(located['latitude'], located['longitude'],
                          located['city'], located['country']))

    def _new_map():
        """Create a world-view base map with the requested size and zoom."""
        return folium.Map(location=[20, 0], zoom_start=zoom_start,
                          width=map_width, height=map_height)

    def _add_markers(target):
        """Add one marker per located point to *target* (map or cluster)."""
        for latitude, longitude, city, country in points:
            folium.Marker(
                location=[latitude, longitude],
                popup=f"City: {city}, Country: {country}"
            ).add_to(target)

    # Full map - every point is its own marker.
    full_map = _new_map()
    _add_markers(full_map)
    full_map.save(full_map_path)
    print(f"Full map saved as {full_map_path}")

    # Clustered map - markers are grouped by a MarkerCluster layer.
    clustered_map = _new_map()
    _add_markers(MarkerCluster().add_to(clustered_map))
    clustered_map.save(clustered_map_path)
    print(f"Clustered map saved as {clustered_map_path}")
# Main execution
if __name__ == "__main__":
    # Directory containing your Spotify JSON files
    data_directory = r'C:\Users\eshaa\Downloads\Spotify Extended Streaming History'  # Adjust this path
    # Path to the GeoLite2 database
    geoip_db_path = r'C:\Users\eshaa\Downloads\GeoLite2-City_20241001\GeoLite2-City_20241001\GeoLite2-City.mmdb'  # Adjust this path

    # Load and process the data
    spotify_data = load_spotify_data(data_directory)
    if spotify_data:
        spotify_df = json_to_dataframe(spotify_data)

        # Show basic statistics
        print("\n--- Basic Statistics ---")
        basic_statistics(spotify_df)

        # Plot monthly streaming activity
        print("\n--- Streaming Activity Over Time ---")
        plot_streaming_by_month(spotify_df)

        # Plot top artists and tracks
        print("\n--- Top 10 Most Played Artists ---")
        plot_top_artists_in_minutes(spotify_df, n=10)
        print("\n--- Top 10 Most Played Tracks ---")
        plot_top_tracks_in_minutes(spotify_df, n=10)

        # Load GeoLite2 database for IP geolocation
        geoip_reader = load_geoip_database(geoip_db_path)
        try:
            # Geolocate IP addresses
            location_df = geolocate_ips(spotify_df, geoip_reader)
        finally:
            # Release the database file even if geolocation fails.
            geoip_reader.close()
        print(location_df.head())  # Show geolocated IPs

        # Generate both full map and clustered map without point limitation
        create_full_and_clustered_maps(location_df, map_width=600, map_height=400, zoom_start=2)
    else:
        print("No data found or could not load data.")