🧠 Datasets

Create a dataset:

Create Dataset

POST https://api.arkangel.ai/api/datasets/

To upload a dataset, you need to divide the file into parts no larger than 50MB. The first request will return an ID, which must be added to the following requests to upload the remaining parts.
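
For example, a minimal sketch of how the number of parts can be computed from the 50 MB limit (the file path below is a placeholder):

import math
import os

# 50 MB limit per part, as required by the endpoint
chunkSize = 1024 * 1024 * 50

# Placeholder path to the zip file that will be uploaded
pathFile = '/path/to/dataset.zip'

# Each request may carry at most 50 MB, so round the part count up
totalParts = math.ceil(os.path.getsize(pathFile) / chunkSize)
print(f'The file must be split into {totalParts} part(s)')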

Request Body

Name        | Type | Description
projectId*  | Num  | {{PROJECT_ID}}
dataset*    | File | Include the .zip file
datasetId*  | Num  | {{DATASET_ID}} (only required after the first request)
partNumber* | Num  | Number from 1 to the total number of parts into which the file is divided
totalParts* | Num  | Total number of parts into which the file is divided

{
  "status": 201,
  "result": {
    "id": "{{ID}}",
    "originalFilename": "heart_2020_cleaned_numbers_target.zip",
    "filename": "4a6fd4a3f6fed6d24ed362f1083dc459.zip",
    "optimizationType": 0,
    "optimizationMetric": auc,
    "labelCols": ["target"],
    "columnNames": ["Col_1", "Col_2", "Col_3"],
    "trainingSpeed": "fast",
    "studyType": "hippocrates",
    "createdAt": "2022-11-22T04:36:12.557Z",
    "updatedAt": "2022-11-22T04:36:12.558Z"
  }
}

Example request

# Libraries needed to run the dataset upload through the API
import json
import math
import os
import requests


#---------------------------- Generator function -----------------------------#
# Generator function used to create stream parts from the zip file
def createFileChunks(file_object, chunk_size=1024):
    # Infinite loop to iterate over all the file_object
    while True:
        # Read only chunk_size bytes from file_object
        chunk_data = file_object.read(chunk_size)

        # If the stream data read is empty
        if chunk_data == b'':
            # There is no more data to be transmitted. Terminate the generator
            break

        # Pass stream of data
        yield chunk_data


#---------------------------- Data input by user -----------------------------#
# Project id of an already initialized project
projectId = ''
# Unique user authentication token (e.g. 'Bearer {{TOKEN}}')
token = ''
# Relative or absolute path of the zip file to be uploaded
pathFile = ''

#------------------------------ Local variables ------------------------------#
# Endpoint url
url = "https://api.arkangel.ai/api/datasets"
# DatasetId initialization variable
datasetId = ''
# FileObject structure for zip file reading
fileObject = open(pathFile, 'rb')
# Size of zip file
fileSize = os.path.getsize(pathFile)
# Predetermined chunk size (50 MB)
chunk_size = 1024 * 1024 * 50

# Calculate total number of parts based on the chunk size and file size
totalParts = math.ceil(fileSize/chunk_size)

# If the file is smaller than or equal to 50 MB
if fileSize <= 1024 * 1024 * 50:
    # The file must be uploaded in a single stream
    chunk_size = 1024 * 1024 * 50
    totalParts = 1

#------------------------------- Upload script -------------------------------#
# Loop used to iterate over all the streams generated by the createFileChunks
# generator function
for index, chunk in enumerate(createFileChunks(fileObject, chunk_size)):
    # Create request payload
    payload={'projectId': projectId,
             'datasetId': datasetId,
             'partNumber': str(index+1),
             'totalParts': str(totalParts)}

    # Add stream data to the request dataform
    files=[('dataset', ('file', chunk, 'application/octet-stream'))]

    # Add authentication token to request headers
    headers = {'Authorization': token}

    # Send the request and capture the response in the response variable
    response = requests.request("POST",
                                url,
                                headers=headers,
                                data=payload,
                                files=files)

    # Show useful info on the console
    print(f'Part {index+1} of {totalParts} ...')
    print(response.text)

    # Get the datasetId from the first response; it is required for parts 2 onward
    if (response.status_code == 201) and (index == 0):
        dict_response = json.loads(response.text)
        datasetId = dict_response['result']['id']

# Close the file object once all parts have been sent
fileObject.close()

# Raise an error in case of a 500 code
if response.status_code == 500:
    raise Exception('Load could not finish, an internal server error '
                    'occurred')

# Show useful info on the console
dict_response = json.loads(response.text)
print(f'\nProject id: {projectId}')
print(f'Dataset id: {datasetId}')

print('\nData load finished!')
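
Once the last part has been accepted, you can optionally confirm that the dataset is attached to the project by calling the List Datasets endpoint described further down this page. A minimal sketch, reusing the projectId and token variables from the script above:

# Optional check: list the datasets of the project (see "List datasets" below)
listUrl = f'https://api.arkangel.ai/api/datasets?projectId={projectId}&limit=10&offset=0'
listResponse = requests.get(listUrl, headers={'Authorization': token})
print(listResponse.text)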

Update a dataset:

Update Dataset

PATCH https://api.arkangel.ai/api/datasets

Request Body

Name              | Type   | Description
datasetId*        | UUID   | {{DATASET_ID}}
originalFilename* | String | {{ORIGINAL_FILENAME}}

{
  "status": 200,
  "result": {
    "id": "{{DATASET_ID}}",
    "originalFilename": "name dataset",
    "route": "s3 route",
    "filename": "",
    "profilingReport": "SIGNED URL AWS",
    "createdAt": "2022-11-21T16:56:21.672Z",
    "updatedAt": "2022-11-21T16:56:21.672Z"
  }
}

Example request

import json
import requests

url = "https://api.arkangel.ai/api/datasets"

update_dataset = {}
# Generate key for updating the dataset ID
update_dataset["datasetId"] = '{{DATASET_ID}}'
# Generate key for updating the File name
update_dataset["originalFilename"] = "GivenFilenameFromUser"
# Convert dictionary to json
payload = json.dumps(update_dataset, indent=4)

# Add the user's web token and the content type to the request headers
headers = {
  'Authorization': 'Bearer {{TOKEN}}',
  'Content-Type': 'application/json'
}
# Send the information created before and obtain response
response = requests.request("PATCH", url, headers=headers, data=payload)



print(response.text)

Load Dataset:

POST https://api.arkangel.ai/api/datasets/load-dataset

Request Body

Name                | Type          | Description
projectId*          | UUID          | {{PROJECT_ID}}
dataset*            | File          | Zip file
optimizationType*   | Integer       | 0
optimizationMetric* | String        | Use AUC for classification and MSE for regression
labelCols*          | Array<String> | Name(s) of the target column(s)
columnNames*        | Array<String> | Names of the predictor columns
trainingSpeed*      | String        | fast, normal or exhaustive

{
  "status": 201,
  "result": {
    "id": "{{ID}}",
    "originalFilename": "heart_2020_cleaned_numbers_target.zip",
    "filename": "4a6fd4a3f6fed6d24ed362f1083dc459.zip",
    "optimizationType": 0,
    "optimizationMetric": auc,
    "labelCols": ["target"],
    "columnNames": ["Col_1", "Col_2", "Col_3"],
    "trainingSpeed": "fast",
    "studyType": "hippocrates",
    "createdAt": "2022-11-22T04:36:12.557Z",
    "updatedAt": "2022-11-22T04:36:12.558Z"
  }
}
Example request

import requests

url = "https://api.arkangel.ai/api/datasets/load-dataset"

# Form fields sent alongside the dataset file
payload = {
    'projectId': '{{PROJECT_ID}}',
    'optimizationType': '0',
    'optimizationMetric': 'auc',          # auc: classification; mse: regression
    'labelCols': '[Target]',              # Name(s) of the target column(s)
    'columnNames': '[col1, col2, col3]',  # Names of the dataset's columns
    'trainingSpeed': 'exhaustive'         # fast, normal or exhaustive
}

# Attach the zip file as multipart form data
files = [
    ('dataset', ('file', open('/path/to/file', 'rb'), 'application/octet-stream'))
]

# Add the user's web token to the request headers
headers = {
  'Authorization': 'Bearer {{TOKEN}}'
}

# Send the form fields and the file, then obtain the response
response = requests.request("POST", url, headers=headers, data=payload, files=files)

# Print response
print(response.text)
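
As the table above notes, optimizationMetric depends on the task type: AUC for classification and MSE for regression. A small hypothetical helper that encodes this rule (pickMetric is our own name, not part of the API):

def pickMetric(taskType):
    # AUC for classification tasks, MSE for regression tasks
    if taskType == 'classification':
        return 'auc'
    if taskType == 'regression':
        return 'mse'
    raise ValueError(f'Unknown task type: {taskType}')

print(pickMetric('classification'))   # auc
print(pickMetric('regression'))       # mse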

List datasets:

List Datasets

GET https://api.arkangel.ai/api/datasets/

Query Parameters

Name      | Type | Description
projectId | UUID | Project Id
limit     | Int  | Number of items per page
offset    | Int  | Page number

Example request

import requests

url = "https://api.arkangel.ai/api/datasets?projectId={{PROJECT_ID}}&typeExperiment=data&limit=10&offset=0"

payload="<file contents here>"
headers = {
  'Authorization': 'Bearer {{TOKEN}}',
  'Content-Type': 'text/plain'
}

response = requests.request("GET", url, headers=headers, data=payload)

print(response.text)
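
Since limit controls the number of items per response and offset selects the page, larger projects can be paged through by incrementing offset. A minimal sketch, assuming offset is a zero-based page index as in the request above:

import requests

headers = {
  'Authorization': 'Bearer {{TOKEN}}'
}
baseUrl = "https://api.arkangel.ai/api/datasets?projectId={{PROJECT_ID}}&limit=10"

# Fetch the first two pages of up to 10 datasets each
for offset in range(2):
    response = requests.get(f"{baseUrl}&offset={offset}", headers=headers)
    print(f"Page {offset}:")
    print(response.text)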

Delete a dataset:

Delete a Dataset

DELETE https://api.arkangel.ai/api/datasets/{datasetId}

Path Parameters

Name       | Type | Description
datasetId* | UUID | Dataset Id

{
  "status": 400,
  "result": "Dataset is a template and cannot be deleted"
}

Example request

import requests

# Replace this with your dataset ID
DATASET_ID = '{{DATASET_ID}}'
url = f"https://api.arkangel.ai/api/datasets/{DATASET_ID}"

# Add the user's web token to the request headers
headers = {
  'Authorization': 'Bearer {{TOKEN}}'
}

response = requests.request("DELETE", url, headers=headers)

print(response.text)
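
Deleting a template dataset returns the 400 response shown above, so a script may want to check the status code before assuming the dataset is gone. A minimal sketch, reusing the response object from the example:

# Reusing the response object from the example above
if response.status_code == 400:
    # For example, the dataset is a template and cannot be deleted
    print('Dataset was not deleted:', response.text)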

Create preprocess:

Create a preprocess on your dataset

POST https://api.arkangel.ai/api/datasets/preprocessing

When you create a dataset, Arkangel AI automatically generates an exploratory analysis of the dataset and flags insights, correlations, interactions and potential errors that we call "Alerts."

The preprocess endpoint allows you to improve the quality of your dataset so that better AI models can be generated automatically.

Example request

import requests
import json

url = "https://api.arkangel.ai/api/datasets/preprocessing"

# Create a dictionary describing the preprocessing to run on the dataset
preproc = dict()
# Key containing the ID of the project that owns the dataset
preproc['projectId'] = "{{PROJECT_ID}}"
# Key containing the ID of the dataset to be preprocessed
preproc['datasetId'] = "{{DATASET_ID}}"
# Key containing all the operations to be applied to the dataset

# Actions are applied to whole columns, e.g. deleting columns flagged as
# REJECTED or as CONSTANT (the same value in every row)
preproc['actions'] = {
    "IDAREA": {
      "name": "REJECTED",
      "value": "delete"
    },
    "DESCAREA": {
      "name": "CONSTANT",
      "value": "delete"
    },
    "CUMPLIDA": {
      "name": "CONSTANT",
      "value": "delete"
    },
    "PREFIJO": {
      "name": "CONSTANT",
      "value": "delete"
    },
    "CANTIDAD": {
      "name": "CONSTANT",
      "value": "delete"
    }
  }
# Replacements fix specific values that are misspelled or that the user
# wants to rename
preproc['replace'] = {
    "TipoAdmision": [
      {
        "str": "AMBULATORIA",
        "replace": "AMBULATORIAA"
      },
      {
        "str": "PAQETES",
        "replace": "PAQUETES"
      }
    ],
    "ORIGEN": [
      {
        "str": "COLOMVIA",
        "replace": "COLOMBIA"
      }
    ]
  }
# Rows with missing or NaN values can either be deleted or imputed with
# k-means, depending on your needs
preproc['inputation'] = 'delete'

payload = json.dumps(preproc, indent=4)

# Add the user's web token and the content type to the request headers
headers = {
  'Authorization': 'Bearer {{TOKEN}}',
  'Content-Type': 'application/json'
}

response = requests.request("POST", url, headers=headers, data=payload)

print(response.text)
