README.md 3.32 KB
Newer Older
1
2
# Database Assignment 2

3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# sai akhilesh Koosuri
# Student ID: 12583943
# Assignment - 2


import os
import pandas
import numpy as np
import time
from IPython.display import display
import hashlib
import os.path
import csv
from csv import writer
# Split the dataset into fixed-size chunk files named "0modified.csv",
# "1modified.csv", ... Each chunk holds up to ROWS_PER_FILE rows and is
# written without a header line or index column.
ROWS_PER_FILE = 100000

dataset_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'dataset.csv')
df = pandas.read_csv(dataset_path, encoding='utf-8')

filecount = 0
# Stepping over row offsets bounds the loop by the size of the frame, and
# writing a whole slice at once avoids one file open per row.
for first_row in range(0, len(df), ROWS_PER_FILE):
    # mode='w' so that re-running the script replaces a chunk instead of
    # appending duplicate rows to it.
    df.iloc[first_row:first_row + ROWS_PER_FILE].to_csv(
        f"{filecount}modified.csv",
        index=False,
        header=False,
        mode='w')
    filecount = filecount + 1

# 1-based number of the row that would follow the last one written.
linecount = len(df) + 1


# Merge Sort
####################################################################
# Sort every chunk file on its own, then combine the sorted chunks into one
# fully sorted file ("mergesorted.csv").
NUM_CHUNKS = 12    # number of "<i>modified.csv" chunk files read from disk
SORT_COLUMN = 26   # positional index of the column used as the sort key

for i in range(NUM_CHUNKS):
    X = pandas.read_csv(f"{i}modified.csv", encoding='utf-8', header=None)
    X = X.sort_values(by=X.columns[SORT_COLUMN], axis=0, ascending=True,
                      inplace=False, kind='mergesort', na_position='last')
    # header=False: the file is read back with header=None below, so a
    # header line would come back as a bogus data row.
    X.to_csv(f"{i}sorted.csv", index=False, header=False, encoding='utf-8')

# Joining the sorted files: load them all and concatenate once, rather than
# re-copying the growing frame for every additional chunk.
Z = pandas.concat(
    [pandas.read_csv(f"{i}sorted.csv", encoding='utf-8', header=None)
     for i in range(NUM_CHUNKS)],
    ignore_index=True)

Z = Z.sort_values(by=Z.columns[SORT_COLUMN], axis=0, ascending=True,
                  inplace=False, kind='mergesort', na_position='last')
# header=False for the same reason as above: this file is later loaded with
# header=None.
Z.to_csv('mergesorted.csv', index=False, header=False, encoding='utf-8')


# Linear Search
#############################################################
# Time a full row-by-row scan of Z, printing every row whose column 25
# equals the wanted title.
wanted_title = "Sandman: Dream Hunters 30th Anniversary Edition"
start = time.time()
row_pos = 0
total_rows = len(Z)
while row_pos < total_rows:
    cell = Z.iloc[row_pos, 25]
    if cell == wanted_title:
        print(Z.iloc[row_pos, :])
    row_pos += 1
end = time.time()
elapsed = end - start
print(f"Runtime of the program is {elapsed}")


# Hash Indexes
################################################################
# Build the hash index: every row of "mergesorted.csv" is appended to one of
# NUM_BINS bucket files ("0.csv" .. "9.csv"), chosen by hashing the value in
# the title column.
NUM_BINS = 10
TITLE_COLUMN = 25

Z = pandas.read_csv('mergesorted.csv', encoding='utf-8', header=None)

bin_files = []
try:
    # Each bin is opened once, in truncate mode. The built-in hash() of a str
    # is salted per interpreter process, so rows left over from an earlier
    # run would sit in the wrong bins and must not be kept.
    for bin_id in range(NUM_BINS):
        # newline='' lets the csv module control line endings itself.
        bin_files.append(open(f"{bin_id}.csv", mode='w',
                              encoding='utf-8', newline=''))
    bin_writers = [csv.writer(bin_file) for bin_file in bin_files]

    # Stream the rows as plain tuples and route each one to its bin.
    for row in Z.itertuples(index=False, name=None):
        value = hash(row[TITLE_COLUMN]) % NUM_BINS
        bin_writers[value].writerow(row)
finally:
    for bin_file in bin_files:
        bin_file.close()


# Hash Search
################################################
# Hash search: hash the title to pick its bin file, then scan only that bin
# and print every row whose column 25 equals the title.
title = "Sandman: Dream Hunters 30th Anniversary Edition"

start = time.time()
value = hash(title) % 10
bin_path = str(value) + '.csv'
# find the corresponding bin and load it as a df
if os.path.exists(bin_path) and os.path.getsize(bin_path) > 0:
    # header=None: bin files hold data rows only, so the first line must not
    # be consumed as column names (it could be the row being searched for).
    X = pandas.read_csv(bin_path, encoding='utf-8', header=None)
else:
    # A missing or empty bin simply means there is nothing to match.
    X = pandas.DataFrame()
for i in range(len(X)):         # scan the rows of the bin
    if X.iloc[i, 25] == title:
        print(X.iloc[i, :])    # prints the tuple with that title
end = time.time()
print(f"Runtime of the program is {end - start}")