🧠Datasets
Create a dataset:
Create Dataset
POST
https://api.arkangel.ai/api/datasets/
To upload a dataset, split the file into parts of no more than 50 MB each. The first request returns a dataset ID, which must be included in the subsequent requests that upload the remaining parts.
Request Body
projectId*
UUID
{{PROJECT_ID}}
dataset*
File
Include .zip file
datasetId*
UUID
{{DATASET_ID}} (This field is only required after the first request.)
partNumber*
Num
Number from 1 to the total number of parts into which the file is divided.
totalParts*
Num
Total number of parts into which the file will be divided.
{
"status": 201,
"result": {
"id": "{{ID}}",
"originalFilename": "heart_2020_cleaned_numbers_target.zip",
"filename": "4a6fd4a3f6fed6d24ed362f1083dc459.zip",
"optimizationType": 0,
"optimizationMetric": auc,
"labelCols": ["target"],
"columnNames": ["Col_1", "Col_2", "Col_3"],
"trainingSpeed": "fast",
"studyType": "hippocrates",
"createdAt": "2022-11-22T04:36:12.557Z",
"updatedAt": "2022-11-22T04:36:12.558Z"
}
}
Example request
# Libraries needed to upload datasets through the API
import json
import math
import os
import requests

#---------------------------- Generator function -----------------------------#
# Generator function used to create stream parts from the zip file
def createFileChunks(file_object, chunk_size=1024):
    # Loop until the whole file_object has been consumed
    while True:
        # Read only chunk_size bytes from file_object
        chunk_data = file_object.read(chunk_size)
        # If the stream data read is empty
        if chunk_data == b'':
            # There is no more data to be transmitted. Terminate the generator
            break
        # Pass the stream of data
        yield chunk_data

#---------------------------- Data input by user -----------------------------#
# Project id of an already initialized project
projectId = ''
# Unique user authentication token
token = ''
# Relative or absolute path of the zip file to be uploaded
pathFile = ''

#------------------------------ Local variables ------------------------------#
# Endpoint url
url = "https://api.arkangel.ai/api/datasets"
# DatasetId initialization variable
datasetId = ''
# File object for reading the zip file
fileObject = open(pathFile, 'rb')
# Size of the zip file
fileSize = os.path.getsize(pathFile)
# Predetermined chunk size (50 MB)
chunk_size = 1024 * 1024 * 50
# Calculate the total number of parts based on chunk size and file size
totalParts = math.ceil(fileSize / chunk_size)
# If the file is smaller than or equal to 50 MB, upload it in a single part
if fileSize <= chunk_size:
    totalParts = 1

#------------------------------- Upload script -------------------------------#
# Iterate over all the streams produced by the createFileChunks generator
for index, chunk in enumerate(createFileChunks(fileObject, chunk_size)):
    # Create the request payload
    payload = {'projectId': projectId,
               'datasetId': datasetId,
               'partNumber': str(index + 1),
               'totalParts': str(totalParts)}
    # Add the stream data to the request form data
    files = [('dataset', ('file', chunk, 'application/octet-stream'))]
    # Add the authentication token to the request headers
    headers = {'Authorization': token}
    # Send the request and capture the response
    response = requests.request("POST",
                                url,
                                headers=headers,
                                data=payload,
                                files=files)
    # Show useful info on the console
    print(f'Part {index + 1} of {totalParts} ...')
    print(response.text)
    # Get the datasetId returned by the first request so parts 2..N can reuse it
    if (response.status_code == 201) and (index == 0):
        dict_response = json.loads(response.text)
        datasetId = dict_response['result']['id']
    # Raise an error in case of a 500 code
    if response.status_code == 500:
        raise Exception('Load could not finish, an internal server error '
                        'occurred')

# Show useful info on the console
print(f'\nProject id: {projectId}')
print(f'Dataset id: {datasetId}')
print('\nData load finished!')
// ? CREATE THE CLASS UPLOADER
class MultiPartUploader {
  constructor(url, file, chunkSize = 50) {
    this.url = url;
    this.file = file;
    this.chunkSize = 1024 * 1024 * chunkSize; // default 50 MB
    this.progress = 0;
  }
  async upload(options = {}, onChunkUploaded) {
    const fileSize = this.file.size;
    const chunkCount = Math.ceil(fileSize / this.chunkSize);
    let currentChunk = 1;
    let res = null;
    // Send one request per file chunk
    while (currentChunk <= chunkCount) {
      const start = (currentChunk - 1) * this.chunkSize;
      const end = currentChunk * this.chunkSize;
      const chunk = this.file.slice(start, end);
      res = await onChunkUploaded({
        options: { ...(options || {}), ...(res?.result || {}) },
        chunk,
        chunkCount,
        partNumber: currentChunk,
      });
      if (res.status !== 201) break; // error uploading file
      this.setProgress((currentChunk / chunkCount) * 100);
      currentChunk++;
    }
  }
  setProgress(progress) {
    this.progress = progress;
  }
}
//? USE THE CLASS
const TOKEN = "{{TOKEN}}";
const PROJECT_ID = "{{PROJECT_ID}}";
const fileInput = document.querySelector('input[type="file"]');
const uploadButton = document.querySelector('button[type="submit"]');
uploadButton.addEventListener("click", async () => {
  const file = fileInput.files[0];
  const uploader = new MultiPartUploader("https://api.arkangel.ai/api/datasets", file);
  await uploader.upload({ projectId: PROJECT_ID }, async ({ options, chunk, chunkCount, partNumber }) => {
    const formData = new FormData();
    formData.append("projectId", options?.projectId);
    formData.append("partNumber", partNumber);
    formData.append("totalParts", chunkCount);
    formData.append("dataset", chunk);
    if (options?.id) formData.append("datasetId", options?.id);
    const headers = new Headers();
    headers.append("Authorization", `Bearer ${TOKEN}`);
    const res = await fetch(uploader.url, {
      headers,
      method: "POST",
      body: formData,
    });
    // Return the parsed body so the uploader can read the dataset id and status
    return res.json();
  });
  console.log("File uploaded successfully!");
});
Update a dataset:
Update Dataset
PATCH
https://api.arkangel.ai/api/datasets
Request Body
datasetId*
UUID
{{DATASET_ID}}
originalFilename*
String
{{ORIGINAL_FILENAME}}
{
"status": 200,
"result": {
"id": "{{DATASET_ID}}",
"originalFilename": "name dataset",
"route": "s3 route",
"filename": "",
"profilingReport": "SIGNED URL AWS",
"createdAt": "2022-11-21T16:56:21.672Z",
"updatedAt": "2022-11-21T16:56:21.672Z"
}
}
Example request
import json
import requests
url = "https://api.arkangel.ai/api/datasets"
update_dataset = {}
# Generate key for updating the dataset ID
update_dataset["datasetId"] = '{{DATASET_ID}}'
# Generate key for updating the File name
update_dataset["originalFilename"] = "GivenFilenameFromUser"
# Convert dictionary to json
payload = json.dumps(update_dataset, indent=4)
# Obtain user's web token
headers = {
'Authorization': 'Bearer {{TOKEN}}',
'Content-Type': 'application/json'
}
# Send the information created before and obtain response
response = requests.request("PATCH", url, headers=headers, data=payload)
print(response.text)
const token = "{{TOKEN}}";
const datasetId = "{{DATASET_ID}}";
const myHeaders = new Headers();
myHeaders.append("Authorization", `Bearer ${token}`);
myHeaders.append("Content-Type", "application/json");
const data = {
datasetId: datasetId,
originalFilename: "File experiment V1"
}
const requestOptions = {
method: 'PATCH',
headers: myHeaders,
body: JSON.stringify(data),
redirect: 'follow'
};
fetch("https://api.arkangel.ai/api/datasets", requestOptions)
.then(response => response.json())
.then(result => console.log(result))
.catch(error => console.log('error', error));
curl --location -g --request PATCH 'https://api.arkangel.ai/api/datasets' \
--header 'Authorization: Bearer {{TOKEN}}' \
--header 'Content-Type: application/json' \
--data-raw '{
"datasetId": {{DATASET_ID}},
"originalFilename": "asdasd"
}'ur
Load Dataset:
POST
https://api.arkangel.ai/api/datasets/load-dataset
Request Body
projectId*
UUID
{{PROJECT_ID}}
dataset*
File
Zip file
optimizationType*
Integer
0
optimizationMetric*
String
For classification use AUC; for regression use MSE
labelCols*
Array<String>
Name(s) of the target column(s)
columnNames*
Array<String>
Names of the predictor variable columns
trainingSpeed*
String
fast | normal | exhaustive
{
"status": 201,
"result": {
"id": "{{ID}}",
"originalFilename": "heart_2020_cleaned_numbers_target.zip",
"filename": "4a6fd4a3f6fed6d24ed362f1083dc459.zip",
"optimizationType": 0,
"optimizationMetric": auc,
"labelCols": ["target"],
"columnNames": ["Col_1", "Col_2", "Col_3"],
"trainingSpeed": "fast",
"studyType": "hippocrates",
"createdAt": "2022-11-22T04:36:12.557Z",
"updatedAt": "2022-11-22T04:36:12.558Z"
}
}
import requests
url = "https://api.arkangel.ai/api/datasets/load-dataset"
# Create a dictionary with the form fields
data = dict()
data['projectId'] = "{{PROJECT_ID}}"
data['optimizationType'] = '0'
data['optimizationMetric'] = 'auc'  # auc: classification; mse: regression
data['labelCols'] = '[target]'  # List of the target column name(s)
data['trainingSpeed'] = "exhaustive"  # fast, normal or exhaustive
data['columnNames'] = '[col1,col2,col3]'  # List with the names of the dataset's columns
# Attach the zip file as multipart form data
files = [
('dataset', ('file', open('/path/to/file', 'rb'), 'application/octet-stream'))
]
# Obtain user's web token
headers = {
'Authorization': 'Bearer {{TOKEN}}'
}
# Send the form fields and the file, and obtain the response
response = requests.request("POST", url, headers=headers, data=data, files=files)
# Print response
print(response.text)
const token = "{{TOKEN}}";
const projectId = "{{PROJECT_ID}}";
const labelCols = ["target"];
const columnNames = ["BMI", "Smoking", "Diabetic"];
// File object taken from an <input type="file"> element
const file = document.querySelector('input[type="file"]').files[0];
const myHeaders = new Headers();
myHeaders.append("Authorization", `Bearer ${token}`);
// Do not set Content-Type manually: the browser adds the multipart boundary
const formdata = new FormData();
formdata.append("projectId", projectId);
formdata.append("dataset", file);
formdata.append("optimizationType", "0");
formdata.append("optimizationMetric", "auc");
formdata.append("labelCols", labelCols);
formdata.append("trainingSpeed", "exhaustive");
formdata.append("columnNames", columnNames);
const requestOptions = {
method: 'POST',
headers: myHeaders,
body: formdata,
redirect: 'follow'
};
fetch("https://api.arkangel.ai/api/datasets/load-dataset", requestOptions)
.then(response => response.json())
.then(result => console.log(result))
.catch(error => console.log('error', error));
curl --location --request POST 'https://api.arkangel.ai/api/datasets/load-dataset' \
--header 'Authorization: Bearer {{TOKEN}}' \
--form 'projectId="fffff24d-0a5f-4c2f-a167-177af026835b"' \
--form 'dataset=@"/path/to/file'\''s/heart_2020_cleaned_numbers_target.zip"' \
--form 'optimizationType="0"' \
--form 'optimizationMetric="auc"' \
--form 'labelCols="[target]"' \
--form 'trainingSpeed="exhaustive"' \
--form 'columnNames="[BMI,Smoking,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer]"'
List datasets:
List Datasets
GET
https://api.arkangel.ai/api/datasets/
Query Parameters
projectId
UUID
Project Id
limit
Int
Maximum number of items per page
offset
Int
Zero-based page number
Example request
import requests
url = "https://api.arkangel.ai/api/datasets?projectId={{PROJECT_ID}}&typeExperiment=data&limit=10&offset=0"
headers = {
'Authorization': 'Bearer {{TOKEN}}'
}
response = requests.request("GET", url, headers=headers)
print(response.text)
const token = "{{TOKEN}}";
const projectId = "{{PROJECT_ID}}";
const typeExperiment = "data";
const limit = 10;
const offset = 0;
const myHeaders = new Headers();
myHeaders.append("Authorization", `Bearer ${token}`);
const requestOptions = {
method: 'GET',
headers: myHeaders,
redirect: 'follow'
};
fetch(`https://api.arkangel.ai/api/datasets?projectId=${projectId}&typeExperiment=${typeExperiment}&limit=${limit}&offset=${offset}`, requestOptions)
.then(response => response.json())
.then(result => console.log(result))
.catch(error => console.log('error', error));
curl --location -g --request GET 'https://api.arkangel.ai/api/datasets?projectId={{PROJECT_ID}}&typeExperiment=data&limit=10&offset=0' \
--header 'Authorization: Bearer {{TOKEN}}'
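If a project holds more datasets than fit in one page, you can walk through the pages by increasing offset until an empty page comes back. A minimal Python sketch of that loop; the assumption here is that the response's result field contains the list of datasets for the page, which is inferred from the response shapes shown elsewhere on this page rather than stated by the API:
import json
import requests
projectId = '{{PROJECT_ID}}'
headers = {'Authorization': 'Bearer {{TOKEN}}'}
limit = 10
offset = 0
datasets = []
while True:
    url = (f"https://api.arkangel.ai/api/datasets?projectId={projectId}"
           f"&typeExperiment=data&limit={limit}&offset={offset}")
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        break
    body = json.loads(response.text)
    # Assumption: 'result' holds this page's list of datasets
    page = body.get('result')
    if not isinstance(page, list) or not page:
        break
    datasets.extend(page)
    offset += 1
print(f'Collected {len(datasets)} datasets')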
Delete a dataset:
Delete a Dataset
DELETE
https://api.arkangel.ai/api/datasets/{datasetId}
Path Parameters
datasetId*
UUID
Dataset Id
{
"status": 400,
"result": "Dataset is a template and cannot be deleted"
}
{
"status": 400,
"result": "Dataset is used in a project"
}
{
"status": 200,
"result": "Dataset Deleted"
}
Example request
import requests
# Replace this with the dataset ID
DATASET_ID = '{{DATASET_ID}}'
url = f"https://api.arkangel.ai/api/datasets/{DATASET_ID}"
# Obtain user's web token
headers = {
'Authorization': 'Bearer {{TOKEN}}'
}
response = requests.request("DELETE", url, headers=headers)
print(response.text)
const token = "{{TOKEN}}";
const datasetId = "{{DATASET_ID}}";
const myHeaders = new Headers();
myHeaders.append("Authorization", `Bearer ${token}`);
const requestOptions = {
method: 'DELETE',
headers: myHeaders,
redirect: 'follow'
};
fetch(`https://api.arkangel.ai/api/datasets/${datasetId}`, requestOptions)
.then(response => response.json())
.then(result => console.log(result))
.catch(error => console.log('error', error));
curl --location -g --request DELETE 'https://api.arkangel.ai/api/datasets/{{DATASET_ID}}' \
--header 'Authorization: Bearer {{TOKEN}}'
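Because a delete can come back with either of the 400 messages shown above, it helps to branch on the response status. A small, hedged Python sketch that extends the request above; the message strings are taken from the example responses on this page:
import json
import requests
DATASET_ID = '{{DATASET_ID}}'
url = f"https://api.arkangel.ai/api/datasets/{DATASET_ID}"
headers = {'Authorization': 'Bearer {{TOKEN}}'}
response = requests.request("DELETE", url, headers=headers)
body = json.loads(response.text)
if response.status_code == 200:
    print('Dataset deleted')
elif response.status_code == 400:
    # e.g. the dataset is a template, or it is still used in a project
    print(f"Could not delete dataset: {body.get('result')}")
else:
    print(f'Unexpected response ({response.status_code}): {response.text}')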
Create preprocess:
Create a preprocess on your dataset
POST
https://api.arkangel.ai/api/datasets/preprocessing
When you create a dataset, Arkangel AI automatically generates an exploratory analysis of it and flags insights, correlations, interactions, and potential errors that we call "Alerts."
The preprocess endpoint allows you to act on those findings and improve the quality of your dataset, so that better AI models are generated automatically.
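The request body combines three parts: per-column actions keyed by the alert type, per-column value replacements, and a strategy for missing values. A compact Python sketch of that shape, using column names and alert types taken from the examples below:
import json
preprocessing_body = {
    "projectId": "{{PROJECT_ID}}",
    "datasetId": "{{DATASET_ID}}",
    # One entry per column to act on; "name" is the alert type flagged by
    # the exploratory analysis and "value" is the action to apply
    "actions": {
        "DESCAREA": {"name": "CONSTANT", "value": "delete"}
    },
    # Per-column string replacements: "str" is the value to find,
    # "replace" is what it becomes
    "replace": {
        "ORIGEN": [{"str": "COLOMVIA", "replace": "COLOMBIA"}]
    },
    # How to handle rows with missing values ("delete" or k-means imputation)
    "inputation": "delete"
}
print(json.dumps(preprocessing_body, indent=2))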
Example request
import requests
import json
url = "https://api.arkangel.ai/api/datasets/preprocessing"
#create dictionary for preprocessing the dataset
preproc = dict()
#create a key that contains the ID of the project to be preprocessed
preproc['projectId'] = "{{PROJECT_ID}}"
#create a key containing the ID of the dataset to be preprocessed
preproc['datasetId'] = "{{DATASET_ID}}"
#create a key containing all the processes to be made on the dataset
# actions are made on columns that contain a constant value
# throughout the entirety of their rows
preproc['actions'] = {
"IDAREA": {
"name": "REJECTED",
"value": "delete"
},
"DESCAREA": {
"name": "CONSTANT",
"value": "delete"
},
"CUMPLIDA": {
"name": "CONSTANT",
"value": "delete"
},
"PREFIJO": {
"name": "CONSTANT",
"value": "delete"
},
"CANTIDAD": {
"name": "CONSTANT",
"value": "delete"
}
}
# Replacements are performed on specific values that are misspelled or that
# the user wants to rename.
preproc['replace'] = {
"TipoAdmision": [
{
"str": "AMBULATORIA",
"replace": "AMBULATORIAA"
},
{
"str": "PAQETES",
"replace": "PAQUETES"
}
],
"ORIGEN": [
{
"str": "COLOMVIA",
"replace": "COLOMBIA"
}
]
}
# Delete rows with missing or NaN values, or apply k-means imputation,
# depending on your needs
preproc['inputation'] = 'delete'
# Convert dictionary to JSON
payload = json.dumps(preproc, indent=4)
# Obtain user's web token
headers = {
'Authorization': 'Bearer {{TOKEN}}',
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
const token = "{{TOKEN}}";
const projectId = "{{PROJECT_ID}}";
const datasetId = "{{DATASET_ID}}";
const myHeaders = new Headers();
myHeaders.append("Authorization", `Bearer ${token}`);
myHeaders.append("Content-Type", "application/json");
const data = {
"projectId": projectId,
"datasetId": datasetId,
"actions": {
"IDAREA": {
"name": "REJECTED",
"value": "delete"
},
"DESCAREA": {
"name": "CONSTANT",
"value": "delete"
},
"CUMPLIDA": {
"name": "CONSTANT",
"value": "delete"
},
"PREFIJO": {
"name": "CONSTANT",
"value": "delete"
},
"CANTIDAD": {
"name": "CONSTANT",
"value": "delete"
}
},
"replace": {
"TipoAdmision": [
{
"str": "AMBULATORIA",
"replace": "AMBULATORIAA"
},
{
"str": "PAQETES",
"replace": "PAQUETES"
}
],
"ORIGEN": [
{
"str": "COLOMVIA",
"replace": "COLOMBIA"
}
]
},
"inputation": "delete"
};
const requestOptions = {
method: 'POST',
headers: myHeaders,
body: JSON.stringify(data),
redirect: 'follow'
};
fetch("https://api.arkangel.ai/api/datasets/preprocessing", requestOptions)
.then(response => response.json())
.then(result => console.log(result))
.catch(error => console.log('error', error));
curl --location -g --request POST 'https://api.arkangel.ai/api/datasets/preprocessing' \
--header 'Authorization: Bearer {{TOKEN}}' \
--header 'Content-Type: application/json' \
--data-raw '{
"projectId": "{{PROJECT_ID}}",
"datasetId": "{{DATASET_ID}}",
"actions": {
"IDAREA": {
"name": "REJECTED",
"value": "delete"
},
"DESCAREA": {
"name": "CONSTANT",
"value": "delete"
},
"CUMPLIDA": {
"name": "CONSTANT",
"value": "delete"
},
"PREFIJO": {
"name": "CONSTANT",
"value": "delete"
},
"CANTIDAD": {
"name": "CONSTANT",
"value": "delete"
}
},
"replace": {
"TipoAdmision": [
{
"str": "AMBULATORIA",
"replace": "AMBULATORIAA"
},
{
"str": "PAQETES",
"replace": "PAQUETES"
}
],
"ORIGEN": [
{
"str": "COLOMVIA",
"replace": "COLOMBIA"
}
]
},
"imputation": "delete"
}'