my_utils.py

# coding: utf-8

# my_utils for Intro to Data Science with Python
# Author: Kat Chuang
# Created: Nov 2014

# --------------------------------------
## Stage 2 begin

import csv 

# open a file and return a double list
def open_with_csv(filename, d='\t'):
    uuids = []
    with open(filename, encoding='utf-8') as tsvin:
        tsvin = csv.reader(tsvin, delimiter=d)
        for row in tsvin:
            uuids.append(row)
    return uuids

#2.a size
def number_of_records(data_sample):
    return len(data_sample)


#2.b calculate sum
def calculate_sum(data_sample):
	total = 0
	for row in data_sample[1:]:
	    price = float(row[2])
	    total += price
	return total

#2.c 1 # Find the average price
def find_average(data_sample, headers=False):
    total = calculate_sum(data_sample)
    size = number_of_records(data_sample)
    if headers:
        total -= 1

    average = total / size
    
    # return average   
    
    return '{:03.2f}'.format(average) #nicely formatted to 2 decimals

#2.d. Max, Min
# Find the maximum price
def find_max(theData, col):
    tempList = []
    
    for row in theData:
        price = float(row[col])
        tempList.append(price)
    return max(tempList)

# Find the minimum price
def find_min(theData, col):
    tempList = []

    for row in theData:
        price = float(row[col])
        tempList.append(price)
    return min(tempList)

# Find the max or min price
def find_max_min(the_data, col, m):
    tempList = [], val = 0

    for row in the_data:
        price = float(row[col])
        tempList.append(price)

        if m == "max": 
            val = max(tempList)
        elif m == "min":
            val = min(tempList)
        else: # hopefully we don’t come to this
            pass 

    return val


## Stage 2 end

# --------------------------------------

## Stage 3 begin
# Stage 3: Cleaning data

def create_bool_field_from_search_term(data_sample, search_term):
    new_array = []
    new_array.append(data_sample[0].append("cashmere"))

    for row in data_sample[1:]:
        new_bool_field=False
        if search_term in row[7]:
            new_bool_field=True
        
        row.append(new_bool_field)
        new_array.append(row)

    return new_array 

#filter by boolean
def filter_col_by_bool(the_data, col):
    filtered_rows = []
    
    for row in the_data[1:]:
        if row[col]:
            filtered_rows.append(row)
            
    return filtered_rows

# Filter rows were columns match a string data type
def filter_col_by_string(the_data, field, filter_condition):
    filtered_rows = []
    
    #find index of field in first row
    col = int(the_data[0].index(field))
    filtered_rows.append(the_data[0])

    for row in the_data[1:]:
        if row[col] == filter_condition:
            filtered_rows.append([str(x).encode('utf8') for x in row])
            
    return filtered_rows

# Filter rows were columns match a float data type
def filter_col_by_float(the_data, field, direction, filter_condition):
    filtered_rows = []
    
    #find index of field in first row
    col = int(the_data[0].index(field))
    cond = float(filter_condition)
    
    for row in the_data[1:]:
        element = float(row[col])
        
        if direction == "<":
            if element < cond: filtered_rows.append(row)
                
        elif direction == "<=":
            if element <= cond: filtered_rows.append(row)

        elif direction == ">":
            if element > cond: filtered_rows.append(row)

        elif  direction == ">=":
            if element >= cond: filtered_rows.append(row)
                
        elif  direction == "==":
            if element == cond: filtered_rows.append(row)
        else:
            pass
        
    return filtered_rows


## Stage 3 end

# --------------------------------------

## Stage 4 begin

#4.a csv
def write_to_file(filename, data_sample):
    example = csv.writer(open(filename, 'w', newline='', encoding='utf-8'))
    example.writerows(data_sample)

#4.b more functions
def write_brand_and_price_to_file(filename, data_sample):

    # confirm that the columns only have two columns, otherwise take the two fields
    num_fields = len(data_sample[0])
    
    brand_field_index = 5 #int(dataSample[0].index("brand"))
    price_field_index = 2 #int(dataSample[0].index("priceLabel"))
    
    #if numFields > 2:
    new_array = []
    for record in data_sample:
        new_record = [None] * 2
        new_record[0] = record[brand_field_index]
        new_record[1] = record[price_field_index]
        new_array.append(new_record)

    # write the file
    write_to_file(filename, new_array)  

def write_min_max_csv(filename, data_sample):

    #find min & max price from data_sample
    min = find_max_min(data_sample, 2, "min")
    max = find_max_min(data_sample, 2, "max")

    new_array = []
    for record in data_sample:

        if (float(record[2]) == min) or (float(record[2]) == max): 
            new_array.append(record)

    write_to_file(filename, new_array)  


#csv with just 2 columns
def write_two_cols(filename, data_sample, col1, col2):    

    #if numFields > 2:
    new_array = []
    for record in data_sample:
        new_record = [None] * 2
        new_record[0] = record[col1]
        new_record[1] = record[col2]
        new_array.append(new_record)

    # write the file
    write_to_file(filename, new_array)

#csv with sorted prices
# ascending is going up i.e. A-Z 
# descending is going down i.e. Z-A 

def write_sorted_prices(filename, data_sample, order="ascending"):

    if order == "descending":
        data_sample.sort(key=lambda x: float(x[2]), reverse=False)
    else:
        data_sample.sort(key=lambda x: float(x[2]), reverse=True)

    write_to_file(filename, data_sample) 

#append another file
def write_append_file(filename, new_data_to_add):
    with open(filename, "a", encoding='utf-8') as myfile:
        for row in new_data_to_add:
            myfile.write(str(row))


#4.c export to excel
from openpyxl import Workbook
from openpyxl.cell import get_column_letter

def save_spreadsheet(filename, data_sample):
    
    wb = Workbook()
    ws = wb.active

    rowIndex = 1
    for rows in data_sample:
        colIndex = 1 
        for field in rows:
            colIndex2 = get_column_letter(colIndex)
            ws.cell('%s%s'%(colIndex2, rowIndex)).value = field
            colIndex +=1
        rowIndex += 1
    
    wb.save(filename)

## Stage 4 end

# --------------------------------------

## Stage 5 begin
import matplotlib.pyplot as plt

#5.a Line charts
def create_line_chart(sample, title, exported_figure_filename):
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    prices = sorted(map(int, sample))
    x_axis_ticks = list( range(len(sample)) )
    ax.plot(x_axis_ticks, prices, label='price points', linewidth=2)
    ax.set_title(title)
    ax.set_xlabel('Tie Price ($)')
    ax.set_ylabel('Number of Ties')
    ax.set_xlim([0,len(sample)])
    fig.savefig(exported_figure_filename)


#5.b bar charts
def create_bar_chart(price_groups, exported_figure_filename):
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    colors=plt.rcParams['axes.color_cycle']
     
    for group in price_groups:
        ax.bar(group, price_groups[group], color=colors[group%len(price_groups)])

    labels = ["$0-50", "$50-100", "$100-150", "$150-200", "$200-250", "$250+"]
    ax.legend(labels)

    ax.set_title('Amount of Ties at price points')
    ax.set_xlabel('Tie Price ($)')
    ax.set_xticklabels(labels, ha='left')
    ax.set_xticks( range(1, len(price_groups)+1) )
    ax.set_ylabel('Number of Ties')

    plt.grid(True)
    fig.savefig(exported_figure_filename)


#5.c tables

import prettytable
from prettytable import PrettyTable

def my_table(data_sample):

    x = PrettyTable(data_sample[0])
    x.align["City name"] = "l" 
    x.padding_width = 1 

    for row in data_sample[1:]:
        x.add_row(row)
    
    print(x)


def create_table(data_sample, price_groups, brand_names, columns, exported_figure_filename):
    tup = build_table_text(data_sample, brand_names)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    for group in price_groups:
        plt.bar(group, price_groups[group]) #color=colors[group%len(price_groups)]

    ax.table(cellText=tup[0], colLabels=columns, rowLabels=tup[1], loc='bottom')
    ax.text(-1.3, 0, 'Discounted Ties Brands', size=12, horizontalalignment='left', verticalalignment='top')
    ax.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        labelbottom='off') # labels along the bottom edge are off

    fig.savefig(exported_figure_filename, dpi=400, bbox_inches='tight')


from collections import Counter
def group_prices_by_range(prices_in_float):
    
    tally = Counter()

    for item in prices_in_float:
        bucket = 0
        rounded_price = round(item, -1)
        if rounded_price >= 0 and rounded_price <= 50:
            bucket = 1
        elif rounded_price >= 50 and rounded_price <= 100:
            bucket = 2
        elif rounded_price >= 100 and rounded_price <= 150:
            bucket = 3
        elif rounded_price >= 150 and rounded_price <= 200:
            bucket = 4
        elif rounded_price >= 200 and rounded_price <= 250:
            bucket = 5
        elif rounded_price >= 250:
            bucket = 6
        else:
            bucket = 7

        tally[bucket] += 1
    return tally

def count_prices_for_brands(data_sample, brand, min_price, max_price):
    count = 0
    for row in data_sample: 
        if str(row[0]) == str(brand):
            if float(row[1]) < max_price:
                if float(row[1]) > min_price:
                    count += 1
    return count

def build_table_text(data_sample, brands):  
    cell_text = []
    row_text = []

    unique_brand_list = sorted(set(brands))
    for b in unique_brand_list:
        b = bytes.decode(b)
        temp_row = [] 
        group1 = count_prices_for_brands(data_sample, b, 0, 50.00)
        group2 = count_prices_for_brands(data_sample, b, 50.00, 100.00)
        group3 = count_prices_for_brands(data_sample, b, 100.00, 150.00)
        group4 = count_prices_for_brands(data_sample, b, 150.00, 200.00)
        group5 = count_prices_for_brands(data_sample, b, 200.00, 250.00)
        group6 = count_prices_for_brands(data_sample, b, 250.00, 1000.00)
        row_list = [group1, group2, group3, group4, group5, group6]
        temp_row.extend(row_list) 
        
        if group1 > 0:
            if any(x >= group1 for x in row_list[1:]):
                cell_text.append(temp_row)
                row_text.append(b)

    return (cell_text, row_text)

def print_brand_avg_min(name, data_from_csv):
    tie_sample = filter_col_by_string(data_from_csv, "brandName", name)
    avg_price = calculate_sum(tie_sample) / len(tie_sample)
    min_price = find_min(tie_sample[1:], 2)
    print("{2} Average: ${0:6.2f}; Min: ${1:.2f}".format(avg_price, min_price, name))


## Stage 5 end

# --------------------------------------

## Stage 6 begin

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def plot_minimal_graph(tally, columns, *args):
    plt.style.use('bmh')
    fig = plt.figure(dpi=200) 
    colors=plt.rcParams['axes.color_cycle']
        
    # --- White background to use less printer ink --- #
    ax = plt.subplot(111,axisbg='white')
        
    # Plot bars and create text labels for the table
    for priceBucket in tally:
        ax.bar(priceBucket, tally[priceBucket], color=colors[priceBucket%len(tally)])
        ax.annotate(r"%d" % (tally[priceBucket]),
                   (priceBucket+0.2, tally[priceBucket]), 
                   va="bottom", ha="center")


    # --- Include a legend  --- #
    ax.legend(columns)
      
    # --- Remove distracting lines on top, left, and right --- #
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)

    # --- Remove distracting tick marks  --- #
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')


    # --- Add chart title and axes labels --- #          
    plt.xlabel('Tie Price', fontsize = 13)
    plt.ylabel('Number of Ties', fontsize = 13)
    plt.title('Chart #1')


    # --- Add labels to bars along x axes --- #
    x = range(1, len(tally)+1)
    plt.xticks(x, columns, rotation='horizontal', ha='left')
            
    return fig
    
def plot_graph_with_table(cell_text, row_text, columns):
    plt.style.use('ggplot') 
    fig = plt.figure() 

    # --- Include table --- #
    ax2 = fig.add_subplot(111)
    ax2.axis('off')

    the_table = ax2.table(cellText=cell_text, 
                        rowLabels=row_text, 
                        colLabels=columns, 
                        loc='center right')


## Stage 6 end