# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function

from collections import Counter

import pandas as pd
from django.contrib.auth.decorators import user_passes_test
from django.shortcuts import redirect, render
from django.urls import reverse
from unidecode import unidecode

from dataops import ops, pandas_db
from ontask.permissions import is_instructor
from workflow.ops import get_workflow

from .forms import UploadCSVFileForm


def remove_non_ascii(text):
    """Transliterate a column name to its closest ASCII equivalent."""
    # unidecode expects a unicode string, so decode first if bytes were given
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return unidecode(text)


@user_passes_test(is_instructor)
def csvupload1(request):
    """
    The four step process will populate the following dictionary with name
    upload_data (divided by the steps in which the entries are set).

    STEP 1:

    initial_column_names: List of column names in the initial file.

    column_types: List of column types as detected by pandas

    src_is_key_column: Boolean list with src columns that are unique

    step_1: URL name of the first step

    :param request: Web request
    :return: Creates the upload_data dictionary in the session
    """

    # Get the current workflow
    workflow = get_workflow(request)
    if not workflow:
        return redirect('workflow:index')

    # Bind the form with the received data
    form = UploadCSVFileForm(request.POST or None, request.FILES or None)

    # Process the initial loading of the form
    if request.method != 'POST':
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'wid': workflow.id,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Process the reception of the file
    if not form.is_multipart():
        msg = "CSV upload form is not multipart"
        context = {'message': msg}

        meta = request.META.get('HTTP_REFERER', None)
        if meta:
            context['meta'] = meta
        return render(request, 'critical_error.html', context=context)

    # If not valid, this is probably because the file submitted was too big
    if not form.is_valid():
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'wid': workflow.id,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Process the CSV file using pandas read_csv
    try:
        # data_frame = pandas_db.load_df_from_csvfile(
        #     request.FILES['file'],
        #     form.cleaned_data['skip_lines_at_top'],
        #     form.cleaned_data['skip_lines_at_bottom'])
        data_frame = pd.read_csv(
            request.FILES['file'],
            index_col=False,
            infer_datetime_format=True,
            quotechar='"',
            skiprows=form.cleaned_data['skip_lines_at_top'],
            # a non-zero skipfooter makes pandas fall back to the python engine
            skipfooter=form.cleaned_data['skip_lines_at_bottom']
            # ,encoding='utf-8'
        )

        # Strip white space from the column names, transliterate them to
        # ASCII, and try to convert string columns to datetime just in case
        cols = {}
        for x in list(data_frame.columns):
            cols[x] = remove_non_ascii(x.strip())

            if data_frame[x].dtype.name == 'object':
                # Column is a string!
                # data_frame[x] = data_frame[x].str.strip()

                # Try the datetime conversion
                try:
                    series = pd.to_datetime(data_frame[x],
                                            infer_datetime_format=True)
                    # Datetime conversion worked! Update the data_frame
                    data_frame[x] = series
                except ValueError:
                    pass

        data_frame.rename(columns=cols, inplace=True)
    except Exception as e:
        form.add_error('file',
                       'File could not be processed ({0})'.format(e))
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # If the frame has repeated column names, it will not be processed.
    if len(set(data_frame.columns)) != len(data_frame.columns):
        dup = [x for x, v in Counter(list(data_frame.columns)).items()
               if v > 1]
        form.add_error(
            'file',
            'The file has duplicated column names (' + ','.join(dup) + ').')
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # If the data frame does not have any unique key, it is not useful (no
    # way to uniquely identify rows). There must be at least one.
    src_is_key_column = ops.are_unique_columns(data_frame)
    if not any(src_is_key_column):
        form.add_error(
            'file',
            'The data has no column with unique values per row. '
            'At least one column must have unique values.')
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Store the data frame in the DB.
    try:
        # Get frame info with three lists: names, types and is_key
        frame_info = ops.store_upload_dataframe_in_db(data_frame, workflow.id)
    except Exception:
        form.add_error(
            'file',
            'Sorry. This file cannot be processed.'
        )
        return render(request, 'dataops/upload1.html',
                      {'form': form,
                       'dtype': 'CSV',
                       'dtype_select': 'CSV file',
                       'prev_step': reverse('dataops:list')})

    # Dictionary to populate gradually throughout the sequence of steps. It
    # is stored in the session.
    request.session['upload_data'] = {
        'initial_column_names': frame_info[0],
        'column_types': frame_info[1],
        'src_is_key_column': frame_info[2],
        'step_1': 'dataops:csvupload1'
    }

    return redirect('dataops:upload_s2')
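# Illustration only: a minimal sketch of how the remove_non_ascii helper above
# treats column names, assuming the unidecode package is installed (the exact
# transliterations depend on the unidecode version). Not part of the view.
#
#     remove_non_ascii(u'Número de años')  ->  'Numero de anos'
#     remove_non_ascii(b'Caf\xc3\xa9')     ->  'Cafe'
#
# Column names are stripped and transliterated this way before the data frame
# is stored, so the later upload steps only ever see ASCII column names.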

