Commit d76afead authored by Jacob Willhoite

submit
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.9 (venv)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (venv)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/PA2_Project.iml" filepath="$PROJECT_DIR$/.idea/PA2_Project.iml" />
    </modules>
  </component>
</project>
import time
import pandas as pd

def hashing(titleToFind):
    # Setup
    ht = {}
    f = pd.read_csv("dataset.csv")
    columns = list(f.columns.values)
    # Hash each row's title and use the hash as the key for that row's data
    for i in range(len(f)):
        titleHash = str(hash(f['title'].iloc[i]))  # get hash of title
        # Put it in our HT, chaining rows that share a hash into one bucket
        if titleHash not in ht:
            ht[titleHash] = list()
        ht[titleHash].append(list(f.iloc[i]))
    print(" Done Hashing. Starting Search...")
    # start timer
    tic = time.perf_counter()
    # Find the correct bucket
    titleHash = str(hash(titleToFind))
    if titleHash in ht:
        pd.set_option('display.max_columns', None)  # show all columns when printing
        # Search that bucket for the title we are looking for
        for row in ht[titleHash]:
            df = pd.DataFrame([row], columns=columns)
            if df['title'].iloc[0] == titleToFind:
                # stop timer
                toc = time.perf_counter()
                print(" Found Record With Title '" + titleToFind + "': ")
                print(df.iloc[0])  # display the record (df holds a single row)
                # return time taken with 4 decimal places
                return round(toc - tic, 4)
    # not found: either no bucket, or a hash collision with no matching title
    toc = time.perf_counter()  # stop timer
    print(f" Search took {toc - tic:0.4f} seconds")
    return False
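
# The chaining behaviour above can be checked in isolation. A minimal sketch
# with a hypothetical titles list (not the real dataset): rows whose titles
# hash to the same key share one bucket, and the bucket is scanned linearly.
def _chaining_sketch():
    titles = ["A Game of Thrones", "Dune", "A Game of Thrones"]  # hypothetical data
    ht = {}
    for i, title in enumerate(titles):
        ht.setdefault(str(hash(title)), []).append(i)  # chain row ids per hash key
    assert ht[str(hash("A Game of Thrones"))] == [0, 2]  # duplicate titles share one bucket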
import pandas as pd
import time

def linear_search(titleToFind):
    # start timer
    tic = time.perf_counter()
    # load in the first chunk (the header line is consumed here)
    df = pd.read_csv("dataset_sorted.csv", nrows=100000)
    # setup
    columns = list(df.columns.values)
    rowsToSkip = 100000
    while len(df) > 0:
        # linear search the chunk for the search criteria
        for i in range(len(df)):
            if df["title"].iloc[i] == titleToFind:  # if we find it
                # stop timer
                toc = time.perf_counter()
                # Display the record
                print(" Found Record With Title '" + titleToFind + "': ")
                print(df.iloc[i])
                return round(toc - tic, 4)  # return time taken with 4 decimal places
        # Not in that chunk, so load the next chunk
        try:
            # the +1 also skips the header line, so no data row is read twice
            df = pd.read_csv("dataset_sorted.csv", nrows=100000, skiprows=rowsToSkip + 1, names=columns)
            rowsToSkip += 100000
        except pd.errors.EmptyDataError:
            break  # no more chunks
    # not found in any chunk
    toc = time.perf_counter()  # stop timer
    print(f" Search took {toc - tic:0.4f} seconds")
    return False
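
# An equivalent chunked scan using pandas' built-in chunksize iterator, which
# replaces the manual skiprows bookkeeping above. A sketch under the same
# dataset_sorted.csv assumption; results should match linear_search().
def linear_search_chunked(titleToFind):
    tic = time.perf_counter()
    for chunk in pd.read_csv("dataset_sorted.csv", chunksize=100000):
        match = chunk[chunk["title"] == titleToFind]  # vectorised comparison per chunk
        if len(match) > 0:
            toc = time.perf_counter()
            print(" Found Record With Title '" + titleToFind + "': ")
            print(match.iloc[0])
            return round(toc - tic, 4)
    toc = time.perf_counter()
    print(f" Search took {toc - tic:0.4f} seconds")
    return False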
import split
import merge_sort
import linear_search
import hash_index


def main():
    print('Splitting...')
    split.splitCSV("dataset.csv")
    print(" Finished splitting file.")
    print('\nMerge-Sorting...')
    merge_sort.merge_sort()
    print(" Finished merging/sorting files.")
    print('\nLinear Searching...')
    timeTaken_linear = linear_search.linear_search("Sandman: Dream Hunters 30th Anniversary Edition")
    if timeTaken_linear is False:  # 'is' so a legitimate 0.0-second result is not mistaken for failure
        print("Could Not Find Entry With Title 'Sandman: Dream Hunters 30th Anniversary Edition'")
    else:
        print('Time Taken To Complete Linear Search: ' + str(timeTaken_linear) + " Seconds \n")
    print('\nHash Indexing...')
    timeTaken_hash = hash_index.hashing("Sandman: Dream Hunters 30th Anniversary Edition")
    if timeTaken_hash is False:
        print("Could Not Find Entry With Title 'Sandman: Dream Hunters 30th Anniversary Edition'")
    else:
        print('Time Taken To Complete Hash Index Search: ' + str(timeTaken_hash) + " Seconds \n")


if __name__ == '__main__':
    main()
import os
import glob
import pandas as pd


def initial_sort():
    print(" Sorting All Files Individually...")
    # Get all files from split
    files_to_sort = glob.glob("dataset_*.csv")
    # For each file, sort it by title
    for f in files_to_sort:
        file = pd.read_csv(f)
        fileSorted = file.sort_values(by=["title"])
        fileSorted.to_csv(f, index=False)


def merge_sort():
    # Setup
    initial_sort()  # Sort all split files individually
    files_to_merge = sorted(glob.glob("dataset_*.csv"), key=os.path.getsize)  # all files, ascending size
    files_left = len(files_to_merge)
    print(" Merging {} files...".format(files_left))
    temp_file_count = files_left + 1  # For naming new merged files
    # While there are still files to merge
    while files_left > 1:
        # Get the out file and the two smallest in files (cheapest merge first)
        temp_file = "dataset_{}.csv".format(temp_file_count)
        first_file = pd.read_csv(files_to_merge[0])
        second_file = pd.read_csv(files_to_merge[1])
        # Because both inputs are already sorted, a stable mergesort over the
        # concatenation recovers the merged order efficiently
        merged = pd.concat([first_file, second_file]).sort_values(by=["title"], kind="mergesort")
        merged.to_csv(temp_file, index=False)
        # Clean up files and update values.
        os.remove(files_to_merge[0])  # remove: already merged into the new file
        os.remove(files_to_merge[1])  # remove: already merged into the new file
        temp_file_count += 1
        files_to_merge = sorted(glob.glob("dataset_*.csv"), key=os.path.getsize)
        files_left = len(files_to_merge)
    os.rename(files_to_merge[0], "dataset_sorted.csv")
    return
'''
# Below is the merge algorithm written manually, without the pandas
# concat/sort approach above. NOTE: it is much slower.
def merge_sort():
    initial_sort()
    files_to_merge = glob.glob("dataset_*.csv")
    files_left = len(files_to_merge)
    print(" Merging {} files...".format(files_left))
    temp_file_count = files_left + 1
    while files_left > 1:
        temp_file = "dataset_{}.csv".format(temp_file_count)
        with open(temp_file, 'w', encoding='utf-8', newline='\n') as tmp_file:
            # Copy the header into the output file first
            first_file = pd.read_csv(files_to_merge[0], nrows=1)
            headers = list(first_file.columns.values)
            tmp_file.write(','.join(headers))
            tmp_file.write('\n')
            rowsToSkip_firstFile = 2
            rowsToSkip_secondFile = 2  # header counts as a row
            first_file = pd.read_csv(files_to_merge[0], nrows=1)
            second_file = pd.read_csv(files_to_merge[1], nrows=1)
            while len(first_file) > 0 and len(second_file) > 0:
                # Write out whichever head row sorts first, then drop it
                if str(first_file['title'][0]) <= str(second_file['title'][0]):
                    first_file.iloc[[0]].to_csv(tmp_file, index=False, header=False, mode='a')
                    first_file = first_file.drop(first_file.index[0])
                    first_file.reset_index(drop=True, inplace=True)
                else:
                    second_file.iloc[[0]].to_csv(tmp_file, index=False, header=False, mode='a')
                    second_file = second_file.drop(second_file.index[0])
                    second_file.reset_index(drop=True, inplace=True)
                # Top up each buffer one row at a time as it runs low
                if len(first_file) < 5:
                    first_file = pd.concat([first_file, pd.read_csv(files_to_merge[0], nrows=1, skiprows=rowsToSkip_firstFile, names=headers)])
                    first_file.reset_index(drop=True, inplace=True)
                    rowsToSkip_firstFile += 1
                if len(second_file) < 5:
                    second_file = pd.concat([second_file, pd.read_csv(files_to_merge[1], nrows=1, skiprows=rowsToSkip_secondFile, names=headers)])
                    second_file.reset_index(drop=True, inplace=True)
                    rowsToSkip_secondFile += 1
            # One buffer is empty, so write out the rest of the nonempty one
            if len(first_file) < 1:
                for i in range(len(second_file)):
                    second_file.iloc[[0]].to_csv(tmp_file, index=False, header=False, mode='a')
                    second_file = second_file.drop(second_file.index[0])
            elif len(second_file) < 1:
                for i in range(len(first_file)):
                    first_file.iloc[[0]].to_csv(tmp_file, index=False, header=False, mode='a')
                    first_file = first_file.drop(first_file.index[0])
        # Clean up files and update values.
        os.remove(files_to_merge[0])
        os.remove(files_to_merge[1])
        temp_file_count += 1
        files_to_merge = glob.glob("dataset_*.csv")
        files_left = len(files_to_merge)
    print("Finish merging files.")
'''
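
# Since both inputs are already sorted, a true streaming merge is also possible
# with the standard library: heapq.merge consumes two sorted iterators lazily,
# so neither file has to fit in memory. A sketch, assuming both CSVs share the
# header written by initial_sort() and are sorted by the "title" column:
import csv
import heapq

def merge_two_sorted_csvs(path_a, path_b, out_path):
    with open(path_a, newline='', encoding='utf-8') as fa, \
         open(path_b, newline='', encoding='utf-8') as fb, \
         open(out_path, 'w', newline='', encoding='utf-8') as out:
        reader_a, reader_b = csv.DictReader(fa), csv.DictReader(fb)
        writer = csv.DictWriter(out, fieldnames=reader_a.fieldnames)
        writer.writeheader()
        # Pull rows from whichever reader's head row sorts first by title
        for row in heapq.merge(reader_a, reader_b, key=lambda r: r["title"]):
            writer.writerow(row)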
def splitCSV(filenameCSV):
    # Setup
    with open(filenameCSV, 'r', encoding="utf-8") as f:
        fileCSV = f.readlines()
    filename = filenameCSV[:-4]  # trim off '.csv'
    header = fileCSV[0]  # store header values
    fileCSV.pop(0)  # remove header from list
    record_per_file = 100000
    file = 1
    # Write out new files, one block of records at a time
    for j in range(0, len(fileCSV), record_per_file):
        write_file = fileCSV[j:j + record_per_file]  # next block of records
        write_file.insert(0, header)  # add header
        with open(filename + "_" + str(file) + '.csv', 'w', encoding="utf-8") as out:
            out.writelines(write_file)  # Write out file
        file += 1
    return
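
# The same split can also be expressed with pandas' chunksize iterator, which
# never holds the whole CSV in memory. A sketch with the same 100000-record
# blocks and the same dataset_N.csv naming scheme:
import pandas as pd

def splitCSV_chunked(filenameCSV, record_per_file=100000):
    filename = filenameCSV[:-4]  # trim off '.csv'
    for n, chunk in enumerate(pd.read_csv(filenameCSV, chunksize=record_per_file), start=1):
        chunk.to_csv(filename + "_" + str(n) + '.csv', index=False)  # header is written to each file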