PDF to HTML#
Available Methods#
/pdf/convert/to/html#
Convert PDF and scanned images into HTML representation with text, fonts, images, vectors, formatting preserved.
Method: POST
Endpoint: /v1/pdf/convert/to/html
Attributes#
Note
Attributes are case-sensitive and should be inside JSON for POST request, for example:
{
"url": "https://example.com/file1.pdf"
}
Attribute |
Description |
Required |
---|---|---|
|
URL to the source file. 1 |
yes |
|
HTTP auth user name if required to access source |
no |
|
HTTP auth password if required to access source |
no |
|
Specify page indices as comma-separated values or ranges to process (e.g. |
no |
|
Unwrap lines into a single line within table cells when |
no |
|
Defines coordinates for extraction, e.g. |
no |
|
Set the language for OCR (text from image) to use for scanned PDF, PNG, and JPG documents input when extracting text. The default is |
no |
|
Set to |
no |
|
Line grouping within table cells. Set to |
no |
|
Password of PDF file, the input must be in string format. |
no |
|
Set |
no |
|
File name for the generated output, the input must be in string format. |
no |
|
Set the expiration time for the output link in minutes (default is |
no |
|
Use this parameter to set additional configurations for fine-tuning and extra options. Explore the Profiles section for more. |
no |
Query parameters#
No query parameters accepted.
Payload 3#
{
"url": "https://pdfco-test-files.s3.us-west-2.amazonaws.com/pdf-to-html/sample.pdf",
"inline": false,
"async": false
}
Response 2#
{
"url": "https://pdf-temp-files.s3.amazonaws.com/a7a86e9f29f84f5180624bdec1facfc2/index.html",
"pageCount": 1,
"error": false,
"status": 200,
"name": "sample.html",
"remainingCredits": 60110
}
CURL#
curl --location --request POST 'https://api.pdf.co/v1/pdf/convert/to/html' \
--header 'x-api-key: *******************' \
--header 'Content-Type: application/json' \
--data-raw '{
"url": "https://pdfco-test-files.s3.us-west-2.amazonaws.com/pdf-to-html/sample.pdf",
"inline": false,
"async": false
}'
Code samples#
var https = require("https");
var path = require("path");
var fs = require("fs");
// `request` module is required for file upload.
// Use "npm install request" command to install.
var request = require("request");
// The authentication key (API Key).
// Get your own by registering at https://app.pdf.co
const API_KEY = "***********************************";
// Source PDF file
const SourceFile = "./sample.pdf";
// Comma-separated list of page indices (or ranges) to process. Leave empty for all pages. Example: '0,2-5,7-'.
const Pages = "";
// PDF document password. Leave empty for unprotected documents.
const Password = "";
// Destination HTML file name
const DestinationFile = "./result.html";
// Set to `true` to get simplified HTML without CSS. Default is the rich HTML keeping the document design.
const PlainHtml = false;
// Set to `true` if your document has the column layout like a newspaper.
const ColumnLayout = false;
// 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.
getPresignedUrl(API_KEY, SourceFile)
.then(([uploadUrl, uploadedFileUrl]) => {
// 2. UPLOAD THE FILE TO CLOUD.
uploadFile(API_KEY, SourceFile, uploadUrl)
.then(() => {
// 3. CONVERT UPLOADED PDF FILE TO HTML
convertPdfToHtml(API_KEY, uploadedFileUrl, Password, Pages, PlainHtml, ColumnLayout, DestinationFile);
})
.catch(e => {
console.log(e);
});
})
.catch(e => {
console.log(e);
});
function getPresignedUrl(apiKey, localFile) {
return new Promise(resolve => {
// Prepare request to `Get Presigned URL` API endpoint
let queryPath = `/v1/file/upload/get-presigned-url?contenttype=application/octet-stream&name=${path.basename(SourceFile)}`;
let reqOptions = {
host: "api.pdf.co",
path: encodeURI(queryPath),
headers: { "x-api-key": API_KEY }
};
// Send request
https.get(reqOptions, (response) => {
response.on("data", (d) => {
let data = JSON.parse(d);
if (data.error == false) {
// Return presigned url we received
resolve([data.presignedUrl, data.url]);
}
else {
// Service reported error
console.log("getPresignedUrl(): " + data.message);
}
});
})
.on("error", (e) => {
// Request error
console.log("getPresignedUrl(): " + e);
});
});
}
function uploadFile(apiKey, localFile, uploadUrl) {
return new Promise(resolve => {
fs.readFile(SourceFile, (err, data) => {
request({
method: "PUT",
url: uploadUrl,
body: data,
headers: {
"Content-Type": "application/octet-stream"
}
}, (err, res, body) => {
if (!err) {
resolve();
}
else {
console.log("uploadFile() request error: " + e);
}
});
});
});
}
function convertPdfToHtml(apiKey, uploadedFileUrl, password, pages, plainHtml, columnLayout, destinationFile) {
// Prepare request to `PDF To HTML` API endpoint
var queryPath = `/v1/pdf/convert/to/html`;
// JSON payload for api request
var jsonPayload = JSON.stringify({
name: path.basename(destinationFile), password: password, pages: pages, simple: plainHtml, columns: columnLayout, url: uploadedFileUrl
});
var reqOptions = {
host: "api.pdf.co",
method: "POST",
path: queryPath,
headers: {
"x-api-key": API_KEY,
"Content-Type": "application/json",
"Content-Length": Buffer.byteLength(jsonPayload, 'utf8')
}
};
// Send request
var postRequest = https.request(reqOptions, (response) => {
response.on("data", (d) => {
response.setEncoding("utf8");
// Parse JSON response
let data = JSON.parse(d);
if (data.error == false) {
// Download HTML file
var file = fs.createWriteStream(destinationFile);
https.get(data.url, (response2) => {
response2.pipe(file)
.on("close", () => {
console.log(`Generated HTML file saved as "${destinationFile}" file.`);
});
});
}
else {
// Service reported error
console.log("convertPdfToHtml(): " + data.message);
}
});
})
.on("error", (e) => {
// Request error
console.log("convertPdfToHtml(): " + e);
});
// Write request data
postRequest.write(jsonPayload);
postRequest.end();
}
import os
import requests # pip install requests
# The authentication key (API Key).
# Get your own by registering at https://app.pdf.co
API_KEY = "***************************************"
# Base URL for PDF.co Web API requests
BASE_URL = "https://api.pdf.co/v1"
# Source PDF file
SourceFile = ".\\sample.pdf"
# Comma-separated list of page indices (or ranges) to process. Leave empty for all pages. Example: '0,2-5,7-'.
Pages = ""
# PDF document password. Leave empty for unprotected documents.
Password = ""
# Destination Html file name
DestinationFile = ".\\result.html"
# Set to $true to get simplified HTML without CSS. Default is the rich HTML keeping the document design.
PlainHtml = False
# Set to $true if your document has the column layout like a newspaper.
ColumnLayout = False
def main(args = None):
uploadedFileUrl = uploadFile(SourceFile)
if (uploadedFileUrl != None):
convertPdfToHtml(uploadedFileUrl, DestinationFile)
def convertPdfToHtml(uploadedFileUrl, destinationFile):
"""Converts PDF To Html using PDF.co Web API"""
# Prepare requests params as JSON
# See documentation: https://developer.pdf.co/api/pdf-to-html/index.html
parameters = {}
parameters["name"] = os.path.basename(destinationFile)
parameters["password"] = Password
parameters["pages"] = Pages
parameters["simple"] = PlainHtml
parameters["columns"] = ColumnLayout
parameters["url"] = uploadedFileUrl
# Prepare URL for 'PDF To Html' API request
url = "{}/pdf/convert/to/html".format(BASE_URL)
# Execute request and get response as JSON
response = requests.post(url, data=parameters, headers={ "x-api-key": API_KEY })
if (response.status_code == 200):
json = response.json()
if json["error"] == False:
# Get URL of result file
resultFileUrl = json["url"]
# Download result file
r = requests.get(resultFileUrl, stream=True)
if (r.status_code == 200):
with open(destinationFile, 'wb') as file:
for chunk in r:
file.write(chunk)
print(f"Result file saved as \"{destinationFile}\" file.")
else:
print(f"Request error: {response.status_code} {response.reason}")
else:
# Show service reported error
print(json["message"])
else:
print(f"Request error: {response.status_code} {response.reason}")
def uploadFile(fileName):
"""Uploads file to the cloud"""
# 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.
# Prepare URL for 'Get Presigned URL' API request
url = "{}/file/upload/get-presigned-url?contenttype=application/octet-stream&name={}".format(
BASE_URL, os.path.basename(fileName))
# Execute request and get response as JSON
response = requests.get(url, headers={ "x-api-key": API_KEY })
if (response.status_code == 200):
json = response.json()
if json["error"] == False:
# URL to use for file upload
uploadUrl = json["presignedUrl"]
# URL for future reference
uploadedFileUrl = json["url"]
# 2. UPLOAD FILE TO CLOUD.
with open(fileName, 'rb') as file:
requests.put(uploadUrl, data=file, headers={ "x-api-key": API_KEY, "content-type": "application/octet-stream" })
return uploadedFileUrl
else:
# Show service reported error
print(json["message"])
else:
print(f"Request error: {response.status_code} {response.reason}")
return None
if __name__ == '__main__':
main()
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
namespace PDFcoApiExample
{
class Program
{
// The authentication key (API Key).
// Get your own by registering at https://app.pdf.co
const String API_KEY = "***********************************";
// Source PDF file
const string SourceFile = @".\sample.pdf";
// Comma-separated list of page indices (or ranges) to process. Leave empty for all pages. Example: '0,2-5,7-'.
const string Pages = "";
// PDF document password. Leave empty for unprotected documents.
const string Password = "";
// Destination HTML file name
const string DestinationFile = @".\result.html";
// Set to `true` to get simplified HTML without CSS. Default is the rich HTML keeping the document design.
const bool PlainHtml = false;
// Set to `true` if your document has the column layout like a newspaper.
const bool ColumnLayout = false;
static void Main(string[] args)
{
// Create standard .NET web client instance
WebClient webClient = new WebClient();
// Set API Key
webClient.Headers.Add("x-api-key", API_KEY);
// 1. RETRIEVE THE PRESIGNED URL TO UPLOAD THE FILE.
// * If you already have the direct file URL, skip to the step 3.
// Prepare URL for `Get Presigned URL` API call
string query = Uri.EscapeUriString(string.Format(
"https://api.pdf.co/v1/file/upload/get-presigned-url?contenttype=application/octet-stream&name={0}",
Path.GetFileName(SourceFile)));
try
{
// Execute request
string response = webClient.DownloadString(query);
// Parse JSON response
JObject json = JObject.Parse(response);
if (json["error"].ToObject<bool>() == false)
{
// Get URL to use for the file upload
string uploadUrl = json["presignedUrl"].ToString();
string uploadedFileUrl = json["url"].ToString();
// 2. UPLOAD THE FILE TO CLOUD.
webClient.Headers.Add("content-type", "application/octet-stream");
webClient.UploadFile(uploadUrl, "PUT", SourceFile); // You can use UploadData() instead if your file is byte[] or Stream
webClient.Headers.Remove("content-type");
// 3. CONVERT UPLOADED PDF FILE TO HTML
// URL for `PDF To HTML` API call
var url = "https://api.pdf.co/v1/pdf/convert/to/html";
// Prepare requests params as JSON
Dictionary<string, object> parameters = new Dictionary<string, object>();
parameters.Add("name", Path.GetFileName(DestinationFile));
parameters.Add("password", Password);
parameters.Add("pages", Pages);
parameters.Add("simple", PlainHtml);
parameters.Add("columns", ColumnLayout);
parameters.Add("url", uploadedFileUrl);
// Convert dictionary of params to JSON
string jsonPayload = JsonConvert.SerializeObject(parameters);
// Execute POST request with JSON payload
response = webClient.UploadString(url, jsonPayload);
// Parse JSON response
json = JObject.Parse(response);
if (json["error"].ToObject<bool>() == false)
{
// Get URL of generated HTML file
string resultFileUrl = json["url"].ToString();
// Download HTML file
webClient.DownloadFile(resultFileUrl, DestinationFile);
Console.WriteLine("Generated HTML file saved as \"{0}\" file.", DestinationFile);
}
else
{
Console.WriteLine(json["message"].ToString());
}
}
else
{
Console.WriteLine(json["message"].ToString());
}
}
catch (WebException e)
{
Console.WriteLine(e.ToString());
}
webClient.Dispose();
Console.WriteLine();
Console.WriteLine("Press any key...");
Console.ReadKey();
}
}
}
package com.company;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import okhttp3.*;
import java.io.*;
import java.net.*;
import java.nio.file.Path;
import java.nio.file.Paths;
public class Main
{
// The authentication key (API Key).
// Get your own by registering at https://app.pdf.co
final static String API_KEY = "***********************************";
// Source PDF file
final static Path SourceFile = Paths.get(".\\sample.pdf");
// Comma-separated list of page indices (or ranges) to process. Leave empty for all pages. Example: '0,2-5,7-'.
final static String Pages = "";
// PDF document password. Leave empty for unprotected documents.
final static String Password = "";
// Destination HTML file name
final static Path DestinationFile = Paths.get(".\\result.html");
// Set to `true` to get simplified HTML without CSS. Default is the rich HTML keeping the document design.
final static boolean PlainHtml = false;
// Set to `true` if your document has the column layout like a newspaper.
final static boolean ColumnLayout = false;
public static void main(String[] args) throws IOException
{
// Create HTTP client instance
OkHttpClient webClient = new OkHttpClient();
// 1. RETRIEVE THE PRESIGNED URL TO UPLOAD THE FILE.
// * If you already have a direct file URL, skip to the step 3.
// Prepare URL for `Get Presigned URL` API call
String query = String.format(
"https://api.pdf.co/v1/file/upload/get-presigned-url?contenttype=application/octet-stream&name=%s",
SourceFile.getFileName());
// Prepare request
Request request = new Request.Builder()
.url(query)
.addHeader("x-api-key", API_KEY) // (!) Set API Key
.build();
// Execute request
Response response = webClient.newCall(request).execute();
if (response.code() == 200)
{
// Parse JSON response
JsonObject json = new JsonParser().parse(response.body().string()).getAsJsonObject();
boolean error = json.get("error").getAsBoolean();
if (!error)
{
// Get URL to use for the file upload
String uploadUrl = json.get("presignedUrl").getAsString();
// Get URL of uploaded file to use with later API calls
String uploadedFileUrl = json.get("url").getAsString();
// 2. UPLOAD THE FILE TO CLOUD.
if (uploadFile(webClient, API_KEY, uploadUrl, SourceFile))
{
// 3. CONVERT UPLOADED PDF FILE TO HTML
PdfToHtml(webClient, API_KEY, DestinationFile, Password, Pages, PlainHtml, ColumnLayout, uploadedFileUrl);
}
}
else
{
// Display service reported error
System.out.println(json.get("message").getAsString());
}
}
else
{
// Display request error
System.out.println(response.code() + " " + response.message());
}
}
public static void PdfToHtml(OkHttpClient webClient, String apiKey, Path destinationFile,
String password, String pages, boolean plainHtml, boolean columnLayout, String uploadedFileUrl) throws IOException
{
// Prepare URL for `PDF To HTML` API call
String query = "https://api.pdf.co/v1/pdf/convert/to/html";
// Make correctly escaped (encoded) URL
URL url = null;
try
{
url = new URI(null, query, null).toURL();
}
catch (URISyntaxException e)
{
e.printStackTrace();
}
// Create JSON payload
String jsonPayload = String.format("{\"name\": \"%s\", \"password\": \"%s\", \"pages\": \"%s\", \"simple\": \"%s\", \"columns\": \"%s\", \"url\": \"%s\"}",
destinationFile.getFileName(),
password,
pages,
plainHtml,
columnLayout,
uploadedFileUrl);
// Prepare request body
RequestBody body = RequestBody.create(MediaType.parse("application/json"), jsonPayload);
// Prepare request
Request request = new Request.Builder()
.url(url)
.addHeader("x-api-key", API_KEY) // (!) Set API Key
.addHeader("Content-Type", "application/json")
.post(body)
.build();
// Execute request
Response response = webClient.newCall(request).execute();
if (response.code() == 200)
{
// Parse JSON response
JsonObject json = new JsonParser().parse(response.body().string()).getAsJsonObject();
boolean error = json.get("error").getAsBoolean();
if (!error)
{
// Get URL of generated HTML file
String resultFileUrl = json.get("url").getAsString();
// Download HTML file
downloadFile(webClient, resultFileUrl, destinationFile.toFile());
System.out.printf("Generated HTML file saved as \"%s\" file.", destinationFile.toString());
}
else
{
// Display service reported error
System.out.println(json.get("message").getAsString());
}
}
else
{
// Display request error
System.out.println(response.code() + " " + response.message());
}
}
public static boolean uploadFile(OkHttpClient webClient, String apiKey, String url, Path sourceFile) throws IOException
{
// Prepare request body
RequestBody body = RequestBody.create(MediaType.parse("application/octet-stream"), sourceFile.toFile());
// Prepare request
Request request = new Request.Builder()
.url(url)
.addHeader("x-api-key", apiKey) // (!) Set API Key
.addHeader("content-type", "application/octet-stream")
.put(body)
.build();
// Execute request
Response response = webClient.newCall(request).execute();
return (response.code() == 200);
}
public static void downloadFile(OkHttpClient webClient, String url, File destinationFile) throws IOException
{
// Prepare request
Request request = new Request.Builder()
.url(url)
.build();
// Execute request
Response response = webClient.newCall(request).execute();
byte[] fileBytes = response.body().bytes();
// Save downloaded bytes to file
OutputStream output = new FileOutputStream(destinationFile);
output.write(fileBytes);
output.flush();
output.close();
response.close();
}
}
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>PDF To HTML Extraction Results</title>
</head>
<body>
<?php
// Note: If you have input files large than 200kb we highly recommend to check "async" mode example.
// Get submitted form data
$apiKey = $_POST["apiKey"]; // The authentication key (API Key). Get your own by registering at https://app.pdf.co
$pages = "";
if(isset($_POST["pages"])){
$pages = $_POST["pages"];
}
$plainHtml = false;
if(isset($_POST["plainHtml"])){
$plainHtml = $_POST["plainHtml"];
}
$columnLayout = false;
if(isset($_POST["columnLayout"])){
$columnLayout = $_POST["columnLayout"];
}
// 1. RETRIEVE THE PRESIGNED URL TO UPLOAD THE FILE.
// * If you already have the direct PDF file link, go to the step 3.
// Create URL
$url = "https://api.pdf.co/v1/file/upload/get-presigned-url" .
"?name=" . urlencode($_FILES["file"]["name"]) .
"&contenttype=application/octet-stream";
// Create request
$curl = curl_init();
curl_setopt($curl, CURLOPT_HTTPHEADER, array("x-api-key: " . $apiKey));
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
// Execute request
$result = curl_exec($curl);
if (curl_errno($curl) == 0)
{
$status_code = curl_getinfo($curl, CURLINFO_HTTP_CODE);
if ($status_code == 200)
{
$json = json_decode($result, true);
// Get URL to use for the file upload
$uploadFileUrl = $json["presignedUrl"];
// Get URL of uploaded file to use with later API calls
$uploadedFileUrl = $json["url"];
// 2. UPLOAD THE FILE TO CLOUD.
$localFile = $_FILES["file"]["tmp_name"];
$fileHandle = fopen($localFile, "r");
curl_setopt($curl, CURLOPT_URL, $uploadFileUrl);
curl_setopt($curl, CURLOPT_HTTPHEADER, array("content-type: application/octet-stream"));
curl_setopt($curl, CURLOPT_PUT, true);
curl_setopt($curl, CURLOPT_INFILE, $fileHandle);
curl_setopt($curl, CURLOPT_INFILESIZE, filesize($localFile));
// Execute request
curl_exec($curl);
fclose($fileHandle);
if (curl_errno($curl) == 0)
{
$status_code = curl_getinfo($curl, CURLINFO_HTTP_CODE);
if ($status_code == 200)
{
// 3. CONVERT UPLOADED PDF FILE TO HTML
PdfToHtml($apiKey, $uploadedFileUrl, $pages, $plainHtml, $columnLayout);
}
else
{
// Display request error
echo "<p>Status code: " . $status_code . "</p>";
echo "<p>" . $result . "</p>";
}
}
else
{
// Display CURL error
echo "Error: " . curl_error($curl);
}
}
else
{
// Display service reported error
echo "<p>Status code: " . $status_code . "</p>";
echo "<p>" . $result . "</p>";
}
curl_close($curl);
}
else
{
// Display CURL error
echo "Error: " . curl_error($curl);
}
function PdfToHtml($apiKey, $uploadedFileUrl, $pages, $plainHtml, $columnLayout)
{
// Create URL
$url = "https://api.pdf.co/v1/pdf/convert/to/html";
// Prepare requests params
$parameters = array();
$parameters["url"] = $uploadedFileUrl;
$parameters["pages"] = $pages;
if($plainHtml){
$parameters["simple"] = $plainHtml;
}
if($columnLayout){
$parameters["columns"] = $columnLayout;
}
// Create Json payload
$data = json_encode($parameters);
// Create request
$curl = curl_init();
curl_setopt($curl, CURLOPT_HTTPHEADER, array("x-api-key: " . $apiKey, "Content-type: application/json"));
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_POST, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_POSTFIELDS, $data);
// Execute request
$result = curl_exec($curl);
if (curl_errno($curl) == 0)
{
$status_code = curl_getinfo($curl, CURLINFO_HTTP_CODE);
if ($status_code == 200)
{
$json = json_decode($result, true);
if (!isset($json["error"]) || $json["error"] == false)
{
$resultFileUrl = $json["url"];
// Display link to the file with conversion results
echo "<div><h2>Conversion Result:</h2><a href='" . $resultFileUrl . "' target='_blank'>" . $resultFileUrl . "</a></div>";
}
else
{
// Display service reported error
echo "<p>Error: " . $json["message"] . "</p>";
}
}
else
{
// Display request error
echo "<p>Status code: " . $status_code . "</p>";
echo "<p>" . $result . "</p>";
}
}
else
{
// Display CURL error
echo "Error: " . curl_error($curl);
}
// Cleanup
curl_close($curl);
}
?>
</body>
</html>
On Github#
Footnotes
- 1
Supports publicly accessible links from any source, including Google Drive, Dropbox, and PDF.co Built-In Files Storage. The PDF.co API only accepts URLs as file inputs, not direct file uploads. If your file is stored locally or not publicly accessible, you must upload it using the File Upload endpoints to get a publicly accessible URL. Note: If you experience intermittent Access Denied or Too Many Requests errors, please try adding
cache:
to enable built-in URL caching (e.g.,cache:https://example.com/file1.pdf
). For data security, you have the option to encrypt output files and decrypt input files. Learn more about user-controlled data encryption.- 2
Main response codes as follows:
Code
Description
200
Success
400
Bad request. Typically happens because of bad input parameters, or because the input URLs can’t be reached, possibly due to access restrictions like needing a login or password.
401
Unauthorized
402
Not enough credits
445
Timeout error. To process large documents or files please use asynchronous mode (set the
async
parameter totrue
) and then check status using the /job/check endpoint. If a file contains many pages then specify a page range using thepages
parameter. The number of pages of the document can be obtained using the /pdf/info endpoint.Note
For more see the complete list of available response codes.
- 3
PDF.co Request size: API requests do not support request sizes of more than
4
megabytes in size. Please ensure that request sizes do not exceed this limit.