Skip to content

JSON Extraction

This guide covers JSON extraction for REST APIs and JSON responses.

Basic Setup

name = "myapi"
start_urls = ["https://api.example.com/items?page=1"]

[extract]
type = "json"

[extract.items]
path = "data.items"  # Path to items array

[extract.items.fields]
id = "id"
title = "name"
price = "pricing.amount"

Dot-Notation Paths

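JSON extraction uses dot-notation to navigate nested structures. Given the response below, the configuration that follows it selects the items array with path, then maps each field using a path relative to a single item: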

{
  "data": {
    "items": [
      {
        "id": 1,
        "name": "Product",
        "pricing": {
          "amount": 99.99,
          "currency": "USD"
        }
      }
    ]
  }
}
[extract.items]
path = "data.items"  # Navigate to the array

[extract.items.fields]
id = "id"                    # data.items[].id
title = "name"               # data.items[].name
price = "pricing.amount"     # data.items[].pricing.amount
currency = "pricing.currency"

Items Configuration

path

Path to the items array (or single item):

[extract.items]
path = "data.items"    # Array at data.items
# path = "results"     # Array at top level
# path = ""            # Full response is the item

No Fields (Full Export)

If no fields table is specified, each item is exported in full, as-is:

[extract.items]
path = "data.items"
# No fields = export full item objects

id

Path to the unique identifier used for deduplication:

[extract.items]
path = ""
id = "listing.ListingID"  # Nested path for detail responses

Field Configuration

Simple Path

[extract.items.fields]
id = "id"
title = "name"

Full Field Config

[extract.items.fields]
id = { path = "id" }
title = { path = "name", required = true }
price = { path = "pricing.amount", parser = "parse_float" }

Field Options

| Option | Type | Default | Description |
| --- | --- | --- | --- |
| path | string | required | Dot-notation path to the value |
| parser | string | null | Parser to transform the value |
| required | bool | false | Fail item if field is missing |

Advanced Path Features

Array Indexing

Access specific array indices:

[extract.items.fields]
first_image = "images.0"         # First image
second_tag = "tags.1"            # Second tag

Wildcards

Extract from all array items:

[extract.items.fields]
all_prices = "variants.*.price"  # Price from all variants
# Result: [10.99, 12.99, 15.99]

Pagination: for APIs whose responses include the next page's URL, list the candidate paths to that URL under [extract.links]:

[extract.links]
pagination = [
    "links.next",
    "meta.next_page_url",
    "pagination.next",
]

Multiple paths are checked in order; the first non-null value is used.

Item URLs from IDs

Build detail URLs from item IDs in listing responses:

[extract.links]
items_path = "data.items"           # Path to items array
items_id = "id"                     # ID field in each item
items_url = "https://api.example.com/items/{id}"  # URL template

This generates URLs like:

- https://api.example.com/items/1
- https://api.example.com/items/2
- etc.

Common API Patterns

Paginated List API

{
  "data": [...],
  "links": {
    "next": "https://api.example.com/items?page=2"
  }
}
[extract.items]
path = "data"

[extract.links]
pagination = ["links.next"]

List + Detail Pattern

The list endpoint returns IDs; the detail endpoint returns the full data:

# Start with list endpoint
start_urls = ["https://api.example.com/items"]

[extract]
type = "json"
items_from = "item"  # Only save items from detail pages

# Full response from detail pages
[extract.items]
path = ""
id = "data.id"

[extract.links]
# Pagination on list pages
pagination = ["links.next"]

# Build detail URLs from list response
items_path = "data"
items_id = "id"
items_url = "https://api.example.com/items/{id}"

Cursor-Based Pagination

{
  "items": [...],
  "cursor": "abc123",
  "has_more": true
}

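A response like this carries only a cursor token, not a ready-made next-page URL, so a pagination path cannot point at it directly: you'll need to construct the next URL from the cursor yourself. Use middleware for complex cases.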

Offset-Based Pagination

{
  "results": [...],
  "total": 1000,
  "offset": 0,
  "limit": 20
}

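Generate start URLs for all pages, list them one per line in a file (urls.txt, shown first), and then reference that file from start_urls in the config (the last lines below):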

# urls.txt
https://api.example.com/items?offset=0&limit=20
https://api.example.com/items?offset=20&limit=20
https://api.example.com/items?offset=40&limit=20
# ... etc
start_urls = { file = "urls.txt" }

[extract.items]
path = "results"

Using Parsers

[extract.items.fields]
price = { path = "price", parser = "parse_float" }
created = { path = "created_at", parser = "strip" }
tags = { path = "tags_string", parser = "parse_json" }

Complete Example

Real-world example for a classifieds API:

name = "classifieds"
start_urls = { file = "category_urls.txt" }

[extract]
type = "json"
items_from = "item"  # Only save from detail pages

# Full detail response
[extract.items]
path = ""  # Full JSON response
id = "listing.ListingID"

# Links from listing pages
[extract.links]
# Next page URL
pagination = ["listings.next_page_url"]

# Build detail URLs from listing data
items_path = "listings.data"
items_id = "ListingID"
items_url = "https://api.example.com/classifieds/{id}"

[policy]
concurrency = 8
delay = 1.0
jitter = 0.2

[fetch.headers]
Content-Type = "application/json"
User-Agent = "MyApp/1.0"

Sample Response Structures

Listing page (/listings?category=1):

{
  "listings": {
    "data": [
      {"ListingID": 1, "Title": "Item 1"},
      {"ListingID": 2, "Title": "Item 2"}
    ],
    "next_page_url": "https://api.example.com/listings?category=1&page=2"
  }
}

Detail page (/classifieds/1):

{
  "listing": {
    "ListingID": 1,
    "Title": "Item 1",
    "Description": "...",
    "Price": 100.00,
    "Images": [...]
  },
  "category": {...},
  "seller": {...}
}

Because path = "" in [extract.items], the entire detail response is saved as one item, with listing.ListingID as its deduplication id.