import pandas as pd import pandasql as ps from datetime import datetime import seaborn daily_engagements = pd.read_csv('./data/daily_engagement.csv') enrollments = pd.read_csv('./data/enrollments.csv')
sqldf
function of the sqldf
module is used, and the dictionary of local names locals()
must also be passed to this function (for more information on using the functions of locals()
and globals()
in Pandasql, you can read on Stackoverflow ). # pandas code top10_engagements_pandas = daily_engagements[['acct', 'total_minutes_visited', 'utc_date']] .sort('total_minutes_visited', ascending = False)[:10] # pandasql code simple_query = ''' SELECT acct, total_minutes_visited, utc_date FROM daily_engagements ORDER BY total_minutes_visited desc LIMIT 10 ''' top10_engagements_pandas = ps.sqldf(simple_query, locals())
daily_engagements['weekday'] = map(lambda x: datetime.strptime(x, '%Y-%m-%d').strftime('%A'), daily_engagements.utc_date) daily_engagements.head()
# pandas code weekday_engagement_pandas = pd.DataFrame(daily_engagements.groupby('weekday').total_minutes_visited.mean()) # pandasql code aggr_query = ''' SELECT avg(total_minutes_visited) as total_minutes_visited, weekday FROM daily_engagements GROUP BY weekday ''' weekday_engagement_pandasql = ps.sqldf(aggr_query, locals()).set_index('weekday') week_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] weekday_engagement_pandasql.loc[week_order].plot(kind = 'bar', rot = 45, title = 'Total time spent on Udacity by weekday')
is_udacity = 0
; # pandas code join_df = pd.merge(daily_engagements, enrollments[enrollments.is_udacity == 0], how = 'inner', right_on ='account_key', left_on = 'acct') join_df = join_df[['account_key', 'status', 'total_minutes_visited', 'utc_date', 'join_date', 'cancel_date']] join_df['days_since_joining'] = map(lambda x: x.days, pd.to_datetime(join_df.utc_date) - pd.to_datetime(join_df.join_date)) join_df['before_cancel'] = (pd.to_datetime(join_df.utc_date) <= pd.to_datetime(join_df.cancel_date)) join_df = join_df[join_df.before_cancel | (join_df.status == 'current')] join_df = join_df[(join_df.days_since_joining < 7) & (join_df.days_since_joining >= 0)] avg_account_total_minutes = pd.DataFrame(join_df.groupby(['account_key', 'status'], as_index = False) .total_minutes_visited.mean()) avg_engagement_pandas = pd.DataFrame(avg_account_total_minutes.groupby('status').total_minutes_visited.mean()) avg_engagement_pandas.columns = [] # pandasql code join_query = ''' SELECT avg(avg_acct_total_minutes) as avg_total_minutes, status FROM (SELECT avg(total_minutes_visited) as avg_acct_total_minutes, status, account_key FROM (SELECT e.account_key, e.status, de.total_minutes_visited, (cast(strftime('%s',de.utc_date) as interger) - cast(strftime('%s',e.join_date) as interger))/(24*60*60) as days_since_joining, (cast(strftime('%s',e.cancel_date) as interger) - cast(strftime('%s', de.utc_date) as interger))/(24*60*60) as days_before_cancel FROM enrollments as e JOIN daily_engagements as de ON (e.account_key = de.acct) WHERE (is_udacity = 0) AND (days_since_joining < 7) AND (days_since_joining >= 0) AND ((days_before_cancel >= 0) OR (status = 'current')) ) GROUP BY status, account_key) GROUP BY status ''' avg_engagement_pandasql = ps.sqldf(join_query, locals()).set_index('status')
cast
and strftime
functions were used to bring the dates from the rows into the timestamp (the number of seconds since the beginning of the epoch), and then calculate the difference between these dates in days.Source: https://habr.com/ru/post/279213/
All Articles