Downloading user data


# Imports
import pylast
import thread
import time
import os.path
# Initialize the connection to the network through the API
print "Connecting to network..."
API_KEY = "abc123"; # this is a sample key
API_SECRET = "hello world"; # this is a sample API secret (obtain one from last.fm)
USERNAME = "jessepaleg";
PASSWORD = pylast.md5("jesseboyd");
NETWORK = pylast.LastFMNetwork(api_key = API_KEY, api_secret = API_SECRET, username = USERNAME, password_hash = PASSWORD);
MAX_USERS = 50000;
USER = "john"

# last.fm sometimes returns corrupt data or null through its API methods. Wrapping str() like this keeps the code in get_data simpler.
def toString(string):
    try:
        return str(string)
    except:
        return "None"

# get_data downloads one user's data to data/<USER>/ and returns that user's friends, so the main loop below can crawl in a breadth-first manner.
def get_data(string,a):
    global myfriends
    user = NETWORK.get_user(string)
    myfile = open("data/"+USER+"/"+str(user)+".txt", "w+")
    friends2 = user.get_friends()
    friends = []
    for i in friends2:
        friends += [toString(i)]
    mytracks = user.get_top_tracks()
    tracks = []
    for i in mytracks:
        tracks += [str(i.item)]
    artistdata = user.get_top_artists()
    artists = []
    weights = []
    for j in artistdata:
        artists += [str(j.item)]
        weights += [str(j.weight)]
    myfile.write("user: "+toString(user));
    myfile.write("\ntracks: "+",".join(tracks));
    myfile.write("\nfriends: "+",".join(friends));
    myfile.write("\nage: "+toString(user.get_age()));
    myfile.write("\ncountry: "+toString(user.get_country()));
    myfile.write("\ngender: "+toString(user.get_gender()));
    myfile.write("\nlanguage: "+toString(user.get_language()));
    myfile.write("\nArtists: "+",".join(artists))
    myfile.write("\nWeights: "+",".join(weights))
    # Close the file and return the list of friends for this user.
    myfile.close()
    if (len(myfriends) < MAX_USERS):
        myfriends += friends
    return friends

if not os.path.exists("data/"+USER+"/"):
    os.makedirs("data/"+USER+"/")

# Start at user "john" because we love john
myfriends = []
get_data(USER, 0)

print "Beginning download..."
for i in xrange(MAX_USERS):
    try:
        # It's fast enough to check the hard drive rather than keep a separate in-memory list of visited users.
        # The bottleneck is always going to be the network, rather than read/write speeds.
        if not os.path.isfile("data/"+USER+"/"+myfriends[0]+".txt"):
            print myfriends[0]
            thread.start_new_thread(get_data, (myfriends[0], 0))
            myfriends.pop(0)

            # Sometimes a thread fails due to corrupt data or some other exception that can't always be caught in a try/except.
            # It's simpler to have a delay between starting threads than to keep track of the number of open threads.
            time.sleep(1)
        else:
            myfriends.pop(0)
    except Exception, e:
        break
print "finalizing..."
# Give sufficient time for remaining threads to finish
time.sleep(25);
print "Starting comparison";
execfile("compare.py")

Analysing data


from os import listdir
import operator
from datetime import datetime
import json

# STEP 1: Loading files into memory.

USER = "john"
__author__ = 'Jesse Boyd'
USERSTXT = listdir("data/"+USER)
LINES = []

# Work out how often (INTERVAL) to print the progress percentage. If there are fewer than 200 files
# there is no point showing progress, since loading is basically instant.
LENGTH = len(USERSTXT)
if (LENGTH > 200): INTERVAL = (LENGTH/100)+1
else: INTERVAL = 500
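# For example (illustrative numbers): with 5000 files, INTERVAL = (5000/100)+1 = 51, so progress is printed roughly once per percent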
last = 1

# The time estimate is less accurate than a Windows file copy. May as well over-estimate; that way you are not disappointed when it is faster (yet still horribly slow).
print "Found "+str(LENGTH)+" files.\nLoading files into memory, this will take around "+str(LENGTH*0.01)+"s."
for i in xrange(LENGTH):
    # Printing the progress at interval
    if (i>last*INTERVAL):
        print str(last)+"%"
        last+=1
        
    # Reading the data from the file
    user = USERSTXT[i]
    toadd = open("data/"+USER+"/"+user).readlines()
    # Sift through this raw data and turn it into a structured list so Python can compare users quickly
    default = ["user: ","tracks: ","friends: ","age: 0","country: ","gender: ","language: ","Artists: ","Weights: ",0,0]
    # Add default values on the off chance the file didn't download fully
    for j in xrange(11):
        if (len(toadd) <= j):
            toadd += [default[j]]

    # Adding the username to the list
    toadd[0] = toadd[0].strip("\n")[6:]
    # Adding the users tracks to the list
    toadd[1] = toadd[1].strip("\n")[8:].split(",")
    if toadd[1] == ['']: toadd[1] = [] # making sure it's not empty
    # Adding friends to the list
    toadd[2] = toadd[2].strip("\n")[9:].split(",")
    # Adding age (unused); fall back to 0 if the field is missing or non-numeric
    AGE = toadd[3].strip("\n")[5:]
    toadd[3] = int(AGE) if AGE.isdigit() else 0
    # Adding country
    toadd[4] = toadd[4].strip("\n")[9:]
    # Adding gender (unused)
    toadd[5] = toadd[5].strip("\n")[8:]
    # Adding language (unused)
    toadd[6] = toadd[6].strip("\n")[10:]
    # Adding artists (keep the ordered list so the weights line up, plus a set for quick comparison)
    ARTISTLIST = toadd[7].strip("\n")[9:].split(",")
    toadd[7] = set(ARTISTLIST)
    if (toadd[7] == set([''])): toadd[7] = set([]); ARTISTLIST = []
    # Adding the weights for each artist
    toadd[8] = toadd[8].strip("\n")[9:].split(",")
    # Adding the sum of the weights
    if (toadd[8] != ['']): toadd[8] = map(int, toadd[8]); toadd[9] = sum(toadd[8])
    # Adding a dictionary mapping each artist to its weight (zip the ordered artist list, not the set, so each artist keeps its own weight)
    toadd[10] = dict(zip(ARTISTLIST, toadd[8]))
    # Finish by adding this user's list of lists to the main list
    mylines = [[toadd[0]], [toadd[1]], [toadd[2]], [toadd[3]], [toadd[4]], [toadd[5]], [toadd[6]], [toadd[7]], [toadd[8]], [toadd[9]], toadd[10]]
    LINES+=[mylines]
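# Each entry of LINES is itself a list of single-element lists (plus the plain dict at the end), indexed as:
#   0 username, 1 track list, 2 friends list, 3 age, 4 country, 5 gender,
#   6 language, 7 artist set, 8 weight list, 9 sum of weights, 10 artist -> weight dictionary
# e.g. LINES[i][0][0] is the username and LINES[i][10] is the artist dictionary.

# recommend_track returns the first of INDEX2's top tracks that INDEX1 does not
# already listen to, or False if there is nothing new to recommend.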
def recommend_track(INDEX1,INDEX2):
    TRACKS1 = LINES[INDEX1][1][0]
    TRACKS2 = LINES[INDEX2][1][0]
    for i in TRACKS2:
        if i not in TRACKS1:
            return i
    return False
    
def get_similarity(INDEX1,INDEX2):
    # The similarity score is a weighted sum: country (2) + shared tracks (40) + shared artists (100) + friendship (4).
    # Users lacking information such as artists or tracks cannot reach the full score, even against themselves.
    similarity = 0.0
    DATA1 = LINES[INDEX1]
    DATA2 = LINES[INDEX2]
    #country - weight of 2
    if DATA1[4][0] not in ["None",""]:
        if (DATA1[4][0]==DATA2[4][0]): similarity+=2
    # Tracks - weight of 40
    try:
        SET1 = set(DATA1[1][0])
        SET2 = set(DATA2[1][0])
        similarity += 40*len(SET1 & SET2)/float(max(len(SET1), len(SET2)))
    except: pass
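    # Worked example (illustrative numbers): 5 shared tracks, where the larger
    # of the two track lists holds 20 entries, contributes 40*5/20.0 = 10 points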
    # Artists - weight of 100
    ARTISTS1 = DATA1[7][0]
    ARTISTS2 = DATA2[7][0]
    if ((len(ARTISTS1)>0) and (len(ARTISTS2)>0)):
        SUM = DATA1[9][0]+DATA2[9][0]
        SET = set(DATA1[10].keys()) & set(DATA2[10].keys())
        SHARED = 0
        for i in SET:
            SHARED += min(DATA1[10][i],DATA2[10][i])    
        # The weight of the shared artists divided by the weight of all artists
        similarity+=100*SHARED/float(SUM)
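        # Worked example (illustrative numbers): if user 1 has weights
        # {X: 60, Y: 40} and user 2 has {X: 30, Z: 70}, then SUM = 200,
        # SHARED = min(60, 30) = 30, and this term adds 100*30/200.0 = 15 points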
    # Friendship - weight of 4
    SET1 = set(DATA1[2][0])
    SET2 = set(DATA2[2][0])
    # Only add the friendship bonus if the friends list isn't an empty set
    if DATA2[0][0] in SET1 and SET1 != set([""]):
        similarity += 4
    return similarity
print "Successfully loaded. Starting comparison"
for USER in ["jessepaleg.txt"]:
    try:
        INDEX = USERSTXT.index(USER)
        VALUES = {}
        START = datetime.now()
        for i in xrange(len(USERSTXT)):
            if i!=INDEX:
                try:
                    SIMILARITY = get_similarity(INDEX,i)
                    if SIMILARITY>0: VALUES[USERSTXT[i]] = SIMILARITY
                except Exception, e2: pass
        links = sorted(VALUES.iteritems(), key=operator.itemgetter(1))[-15:]
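        # VALUES is sorted ascending by similarity, so the last 15 entries are the
        # 15 most similar users; reversed(links) below walks from most to least similar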
        nodes = [(USER,'circle','red')]
        names = [USER]
        track = False
        recUSER = False
        for i in reversed(links):
            if i[0].split(".txt")[0] not in LINES[INDEX][2][0]:
                recUSER = i[0]
                break
        for i in reversed(links):
            if (track==False):
                track = recommend_track(INDEX,USERSTXT.index(i[0]))
            if (track!=False):
                links.insert(0,(USER,track,"Recommended Track",8,"green"))
                break
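        # At this point links holds (username, similarity) 2-tuples plus possibly one
        # 5-tuple for the recommended track; the length check below tells them apart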
        for i in xrange(len(links)):
            if (len(links[i])<3):
                if (recUSER==links[i][0]):
                    links[i] = (USER,links[i][0],links[i][1],links[i][1],"green")
                    nodes+=[(links[i][1],'circle',"green")]
                else:
                    links[i] = (USER,links[i][0],links[i][1],links[i][1],"blue")
                    nodes+=[(links[i][1],'circle',"blue")]
            else:
                nodes+=[(links[i][1],'square',"green")]
            names+=[links[i][1]]
        jnodes = []
        jlinks = []
        for i in xrange(len(nodes)):
            name = nodes[i][0]
            shape = nodes[i][1]
            color = nodes[i][2]
            jnodes.append({'id': name, 'shape': shape, 'color': color})
        for i in links:
            jlinks.append({'source': names.index(i[0]), 'target': names.index(i[1]), 'label': i[2], 'width': i[3], 'color': i[4]})
        with open(USER+".json", "w") as outfile:
            json.dump({"directed":False,"multigraph":False,"graph":[["__costumes__",{}]],'nodes':jnodes,'links':jlinks}, outfile, indent=4)
        print ((datetime.now()-START))
    except Exception,error:
        print error," for user: ",USER
# Pause so the elapsed time stays on screen (raw_input, because input() in Python 2 would try to evaluate whatever is typed)
raw_input(str(datetime.now()-START))