// Repository-viewer residue captured together with this file (not part of the script):
//   CI: "ci / ci (22, ubuntu-latest) (push)" was cancelled — some checks failed.
//   Commit message: Nuxt 4 + Supabase + Flightics API. Includes flight search, inspirations,
//   watchlist, price tracking and an interactive map.
//   Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
//   File info: 521 lines, 18 KiB, TypeScript
/**
 * Discovers airline booking URLs using Playwright.
 *
 * Strategies, in order of priority:
 * 1. Look for links in the HTML that contain IATA codes → extract a URL template
 * 2. Intercept pushState/replaceState while interacting with the search form
 * 3. Capture network requests that carry search parameters
 * 4. Fallback: store the booking page URL without a template
 *
 * Run: npx tsx scripts/discover-booking-urls.ts --offset 0 --limit 100
 * Test: npx tsx scripts/discover-booking-urls.ts --iata KL,BA,FR
 * 2nd pass: npx tsx scripts/discover-booking-urls.ts --retry-failed --offset 0 --limit 1020
 */
|
|
|
|
import { chromium, type Browser, type Page } from 'playwright'
|
|
import { createClient } from '@supabase/supabase-js'
|
|
import { parseArgs } from 'node:util'
|
|
|
|
// --- Config ---

/** Supabase endpoint; falls back to a local instance when SUPABASE_URL is unset or empty. */
const SUPABASE_URL = process.env.SUPABASE_URL || 'http://localhost:8000'

/** Key read from SUPABASE_SERVICE_ROLE_KEY; an empty string when the variable is unset. */
const SUPABASE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY || ''

/** Milliseconds. Used as the page default timeout and for the first navigation to the airline site. */
const SITE_TIMEOUT = 20_000

/** Milliseconds. Used for the follow-up navigation to the booking page. */
const NAV_TIMEOUT = 15_000
|
|
|
|
// Well-known IATA airport and city codes used to detect search-URL patterns in links.
// If any of these appears in an href, the link likely reveals the search URL format.
const KNOWN_IATA_CODES = new Set([
  'AMS', 'LHR', 'CDG', 'FRA', 'MAD', 'BCN', 'FCO', 'MXP', 'IST', 'ATH',
  'JFK', 'LAX', 'MIA', 'SFO', 'ORD', 'BOS', 'ATL', 'DFW', 'SEA', 'DEN',
  'NRT', 'HND', 'ICN', 'PEK', 'PVG', 'HKG', 'SIN', 'BKK', 'DEL', 'BOM',
  'DXB', 'DOH', 'CAI', 'JNB', 'NBO', 'ADD', 'CMN', 'ALG', 'LOS', 'ACC',
  'GRU', 'EZE', 'BOG', 'LIM', 'SCL', 'MEX', 'CUN', 'PTY', 'SJO', 'HAV',
  'SYD', 'MEL', 'AKL', 'NYC', 'LON', 'PAR', 'TYO', 'ROM', 'MIL',
])
|
|
|
|
// Booking-related substrings looked for (case-insensitively) inside link hrefs.
const BOOKING_HREF_PATTERNS = [
  'book', 'booking', 'flight', 'search', 'reserv', 'vuelo',
  'fly', 'ticket', 'trip', 'travel', 'buy', 'fare', 'offer',
]

// One CSS selector list matching any <a> whose href contains one of the patterns above.
const BOOKING_HREF_SELECTOR = BOOKING_HREF_PATTERNS
  .map(p => `a[href*="${p}" i]`)
  .join(', ')
|
|
|
|
// Selectors for search form inputs.
// Candidates for the "origin" field, tried in this order; each matches a substring of the
// input's name, placeholder, aria-label or id, case-insensitively.
const ORIGIN_SELECTORS = [
  'input[name*="origin" i]', 'input[name*="from" i]', 'input[name*="departure" i]',
  'input[name*="depart" i]', 'input[name*="salida" i]', 'input[name*="origen" i]',
  'input[placeholder*="from" i]', 'input[placeholder*="origin" i]',
  'input[placeholder*="departure" i]', 'input[placeholder*="desde" i]',
  'input[placeholder*="origen" i]', 'input[placeholder*="salida" i]',
  'input[aria-label*="from" i]', 'input[aria-label*="origin" i]',
  'input[aria-label*="departure" i]', 'input[aria-label*="desde" i]',
  'input[id*="origin" i]', 'input[id*="from" i]', 'input[id*="depart" i]',
]
|
|
|
|
// Candidates for the "destination" field, tried in this order.
// NOTE(review): the bare `to` substring selectors also match unrelated attribute values
// (e.g. "autocomplete", "photo", "token"); consider tightening them if false hits show up.
const DEST_SELECTORS = [
  'input[name*="destination" i]', 'input[name*="to" i]', 'input[name*="arrival" i]',
  'input[name*="arriv" i]', 'input[name*="destino" i]', 'input[name*="llegada" i]',
  'input[placeholder*="to" i]', 'input[placeholder*="destination" i]',
  'input[placeholder*="arrival" i]', 'input[placeholder*="hacia" i]',
  'input[placeholder*="destino" i]', 'input[placeholder*="llegada" i]',
  'input[aria-label*="to" i]', 'input[aria-label*="destination" i]',
  'input[aria-label*="arrival" i]', 'input[aria-label*="destino" i]',
  'input[id*="destination" i]', 'input[id*="to" i]', 'input[id*="arriv" i]',
]
|
|
|
|
// Candidates for the control that submits the search form, tried in this order.
const SEARCH_BUTTON_SELECTORS = [
  'button[type="submit"]',
  'button:has-text("Search")', 'button:has-text("Buscar")',
  'button:has-text("Book")', 'button:has-text("Find")',
  'button:has-text("Reservar")', 'button:has-text("Buscar vuelos")',
  'button:has-text("Search flights")', 'button:has-text("Find flights")',
  'a:has-text("Search")', 'a:has-text("Buscar")',
  'input[type="submit"]',
]
|
|
|
|
// --- Supabase ---
|
|
// Module-level client built from the SUPABASE_URL / SUPABASE_KEY constants of this file.
const supabase = createClient(SUPABASE_URL, SUPABASE_KEY)
|
|
|
|
/** Row shape selected from the `airlines` table (`iata, name, website`). */
interface Airline {
  /** Airline code; used as the key when writing results back. */
  iata: string
  /** Airline name, used only in log output. */
  name: string
  /** URL the discovery starts from. */
  website: string
}
|
|
|
|
/** Outcome of running the discovery strategies against one airline. */
interface DiscoveryResult {
  /** Code of the airline the result belongs to. */
  iata: string
  /** Concrete booking/search URL that was found, or null when nothing was found. */
  bookingUrl: string | null
  /** URL with `{origin}` / `{destination}` / `{date}` / `{passengers}` placeholders, or null. */
  bookingUrlTemplate: string | null
  /** How the template was discovered (e.g. 'link-template', 'pushstate'). */
  method?: string
  /** Truncated error message when the discovery threw. */
  error?: string
}
|
|
|
|
// --- Template extraction from links ---
|
|
|
|
/**
 * Turns one absolute link URL into a URL template, or returns null when the URL
 * contains no known IATA code.
 *
 * Placeholders are inserted into the already-resolved URL string. Resolving a URL
 * that already contains `{origin}` would percent-encode the braces in the path
 * (`%7Borigin%7D`), which breaks later substitution.
 */
function templateFromAbsoluteLink(absoluteUrl: string): string | null {
  // A malformed percent-escape must not abort the scan; fall back to the raw string.
  let decoded: string
  try {
    decoded = decodeURIComponent(absoluteUrl)
  } catch {
    decoded = absoluteUrl
  }

  const threeLetterWords: string[] = decoded.match(/\b[A-Z]{3}\b/g) ?? []
  const uniqueCodes = [...new Set(threeLetterWords.filter(code => KNOWN_IATA_CODES.has(code)))]
  const first = uniqueCodes[0]
  const second = uniqueCodes[1]
  if (first === undefined) return null

  let template = absoluteUrl
  if (second !== undefined) {
    // Two codes: first = origin, second = destination.
    template = template.replace(new RegExp(`\\b${first}\\b`), '{origin}')
    template = template.replace(new RegExp(`\\b${second}\\b`), '{destination}')
    // Round trip: the origin may be repeated as the return leg.
    template = template.replace(new RegExp(`\\b${first}\\b`), '{origin}')
  } else {
    // Single code: likely a destination-only link from the homepage.
    template = template.replace(new RegExp(`\\b${first}\\b`, 'g'), '{destination}')
  }

  // A lone "1" after "=" or ":" is treated as the passenger count.
  template = template.replace(/(?<=[=:])1(?=[&:,\s]|$)/g, '{passengers}')

  // Codes that only appear percent-encoded are found in `decoded` but not replaced here.
  return template.includes('{destination}') || template.includes('{origin}') ? template : null
}

/**
 * Scans all links on the page for URLs containing known IATA airport codes.
 * Such destination links reveal the search URL pattern.
 *
 * @param page    Page whose `a[href]` elements are inspected.
 * @param baseUrl URL used to resolve relative hrefs.
 * @returns The first usable link as `{ template, bookingUrl }`, where `template` carries
 *          `{origin}` / `{destination}` (and possibly `{passengers}`) placeholders and
 *          `bookingUrl` is the resolved original link; null when no link qualifies or the
 *          DOM query fails.
 */
async function extractTemplateFromLinks(page: Page, baseUrl: string): Promise<{ template: string; bookingUrl: string } | null> {
  // Must look like a flight search/booking URL ...
  const flightLinkPattern = /search|book|flight|reserv|offer|fare|vuelo|select/i
  // ... and must not point at cars, hotels, guides, insurance, etc.
  const nonFlightLinkPattern = /car[s.]|hotel|guide|insurance|lounge|cargo|club|baggage|checkin|check-in|status|manage/i

  let hrefs: string[]
  try {
    hrefs = await page.$$eval('a[href]', els => els.map(el => el.getAttribute('href') || ''))
  } catch {
    // DOM query failed (e.g. the page navigated away); treat as "nothing found".
    return null
  }

  for (const href of hrefs) {
    if (!href || href.length < 10 || href.length > 500) continue
    if (!flightLinkPattern.test(href)) continue
    if (nonFlightLinkPattern.test(href)) continue

    const bookingUrl = resolveUrl(baseUrl, href)
    const template = templateFromAbsoluteLink(bookingUrl)
    if (template) {
      return { template, bookingUrl }
    }
  }

  return null
}
|
|
|
|
// --- Helper functions ---
|
|
|
|
/**
 * Resolves `href` against `base` and returns the absolute URL as a string.
 * When the pair cannot be parsed as a URL, `href` is returned unchanged.
 */
function resolveUrl(base: string, href: string): string {
  try {
    return new URL(href, base).toString()
  } catch {
    return href
  }
}
|
|
|
|
/**
 * Picks the href of the link most likely to lead to the booking/search page.
 *
 * Candidates are the anchors matched by BOOKING_HREF_SELECTOR. The first visible
 * candidate whose text or href looks booking-related wins. Otherwise the first
 * visible candidate is used, then the first hidden one. Links to career/about/
 * press/blog pages and non-navigable hrefs (`javascript:`, `mailto:`, `tel:`, `#…`)
 * are never returned, not even as a fallback.
 *
 * @returns The raw (possibly relative) href, or null when nothing suitable exists
 *          or the page query fails.
 */
async function findBookingLink(page: Page): Promise<string | null> {
  const excludedPages = /career|about|press|blog/i
  const nonNavigable = /^\s*(javascript:|mailto:|tel:|#)/i
  const bookingText = /book|reserv|search|buscar|vuelo|flight|fly|ticket|buy/i
  const bookingHref = /book|reserv|search|flight/i

  try {
    const links = await page.$$(BOOKING_HREF_SELECTOR)
    let visibleFallback: string | null = null
    let hiddenFallback: string | null = null

    for (const link of links) {
      const href = await link.getAttribute('href')
      if (!href || nonNavigable.test(href) || excludedPages.test(href)) continue

      const isVisible = await link.isVisible().catch(() => false)
      if (!isVisible) {
        if (hiddenFallback === null) hiddenFallback = href
        continue
      }

      const text = (await link.textContent()) ?? ''
      if (bookingText.test(text) || bookingHref.test(href)) {
        return href
      }
      if (visibleFallback === null) visibleFallback = href
    }

    return visibleFallback ?? hiddenFallback
  } catch {
    // Best effort: a failed DOM query simply means "no booking link".
    return null
  }
}
|
|
|
|
/**
 * Fills the first visible input matched by any of `selectors` with `value`.
 *
 * Selectors are tried in order. On the first visible match the input is clicked,
 * filled, given 500 ms to react, and Enter is pressed (a failing key press is ignored).
 *
 * @returns true when an input was filled, false when no selector produced a visible input.
 */
async function findAndFillInput(page: Page, selectors: string[], value: string): Promise<boolean> {
  for (const selector of selectors) {
    try {
      const input = await page.$(selector)
      if (!input) continue

      const shown = await input.isVisible().catch(() => false)
      if (!shown) continue

      await input.click()
      await input.fill(value)
      await page.waitForTimeout(500)
      await input.press('Enter').catch(() => {})
      return true
    } catch {
      // This selector failed; move on to the next candidate.
    }
  }
  return false
}
|
|
|
|
/**
 * Clicks the first visible element matched by SEARCH_BUTTON_SELECTORS.
 *
 * @returns true when something was clicked, false when no candidate was visible.
 */
async function clickSearchButton(page: Page): Promise<boolean> {
  for (const selector of SEARCH_BUTTON_SELECTORS) {
    try {
      const button = await page.$(selector)
      if (!button) continue

      const shown = await button.isVisible().catch(() => false)
      if (!shown) continue

      await button.click()
      return true
    } catch {
      // This selector failed; move on to the next candidate.
    }
  }
  return false
}
|
|
|
|
/**
 * Converts a concrete search URL into a template by replacing the known test
 * values with `{origin}`, `{destination}`, `{date}` and `{passengers}` placeholders.
 *
 * Only the part after the scheme and host is rewritten, so a host such as
 * `airmadagascar.com` is never touched. Airport codes are matched
 * case-insensitively but only as whole alphanumeric tokens, so `nomad` or
 * `madrid` are not mistaken for `MAD`. A concatenated pair such as `MADLHR` is
 * recognised as well.
 *
 * @param url         Concrete URL produced by a search with the test values.
 * @param origin      Origin code that was searched for (e.g. 'MAD').
 * @param destination Destination code that was searched for (e.g. 'LHR').
 * @param dateIso     Travel date as YYYY-MM-DD; the DD/MM/YYYY and YYYYMMDD forms are also replaced.
 * @returns The URL with placeholders substituted; unchanged when nothing matched.
 */
function buildTemplateFromUrl(url: string, origin: string, destination: string, dateIso: string): string {
  // City spellings some sites put in the URL instead of the code, keyed by code.
  const cityAliases: Record<string, string[]> = {
    MAD: ['madrid'],
    LHR: ['london', 'londres', 'heathrow'],
  }

  const escapeRegExp = (text: string): string => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')

  // Literal replacement of every occurrence; an empty needle is a no-op.
  const replaceLiteral = (text: string, needle: string, placeholder: string): string =>
    needle ? text.split(needle).join(placeholder) : text

  // Replacement of `token` only where it is not glued to other letters or digits.
  const replaceToken = (text: string, token: string, placeholder: string): string => {
    if (!token) return text
    const bounded = new RegExp(`(?<![A-Za-z0-9])${escapeRegExp(token)}(?![A-Za-z0-9])`, 'gi')
    return text.replace(bounded, placeholder)
  }

  // Keep "scheme://host" out of reach of the substitutions.
  const hostMatch = /^([a-z][a-z0-9+.-]*:\/\/[^/?#]*)([\s\S]*)$/i.exec(url)
  const prefix = hostMatch ? (hostMatch[1] ?? '') : ''
  let rest = hostMatch ? (hostMatch[2] ?? '') : url

  const dateCompact = dateIso.replace(/-/g, '')
  const dateDMY = dateIso.split('-').reverse().join('/')
  for (const form of [dateIso, dateDMY, dateCompact, encodeURIComponent(dateIso), encodeURIComponent(dateDMY)]) {
    rest = replaceLiteral(rest, form, '{date}')
  }

  if (origin && destination) {
    rest = replaceToken(rest, origin + destination, '{origin}{destination}')
  }
  rest = replaceToken(rest, origin, '{origin}')
  rest = replaceToken(rest, destination, '{destination}')

  // City names are matched anywhere (they may be part of a longer slug).
  for (const alias of cityAliases[origin.toUpperCase()] ?? []) {
    rest = rest.replace(new RegExp(escapeRegExp(alias), 'gi'), '{origin}')
  }
  for (const alias of cityAliases[destination.toUpperCase()] ?? []) {
    rest = rest.replace(new RegExp(escapeRegExp(alias), 'gi'), '{destination}')
  }

  // Passenger count: a lone "1" after "=" or ":" in the query string.
  const segments = rest.split('?')
  const query = segments[1]
  if (query) {
    segments[1] = query.replace(/(?<=[=:])1(?=[&:,\s]|$)/g, '{passengers}')
    rest = segments.join('?')
  }

  return prefix + rest
}
|
|
|
|
// --- Main discovery function ---
|
|
|
|
/**
 * Runs the discovery strategies against one airline website and reports what was found.
 *
 * Order of attempts (the first one that yields a template returns immediately):
 *  1. Template extracted from links on the landing page ('link-template').
 *  2. Template extracted from links on the booking page ('booking-page-link-template').
 *  3. Fill the search form with MAD → LHR, submit, then look at
 *     a) a changed page URL ('form-url-change'),
 *     b) URLs passed to pushState/replaceState ('pushstate'),
 *     c) captured XHR/fetch request URLs ('network-request').
 * When none succeeds, `bookingUrl` still holds the booking page (or the landing page).
 *
 * Never throws: failures are reported through `result.error`. The page is always closed.
 *
 * @param browser Browser used to open a fresh page for this airline.
 * @param airline Airline row; `website` is the starting URL.
 */
async function discoverAirline(browser: Browser, airline: Airline): Promise<DiscoveryResult> {
  const result: DiscoveryResult = {
    iata: airline.iata,
    bookingUrl: null,
    bookingUrlTemplate: null,
  }

  let page: Page | null = null
  try {
    page = await browser.newPage({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36',
      viewport: { width: 1280, height: 720 },
      locale: 'es-ES',
    })
    page.setDefaultTimeout(SITE_TIMEOUT)

    // Mask navigator.webdriver and record every URL handed to pushState/replaceState.
    await page.addInitScript(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false })
      ;(window as any).__capturedUrls = []
      const origPush = history.pushState.bind(history)
      const origReplace = history.replaceState.bind(history)
      history.pushState = (...args: Parameters<History['pushState']>) => {
        // The URL argument may be a string or a URL object; store it as a string.
        if (args[2] != null) (window as any).__capturedUrls.push(String(args[2]))
        return origPush(...args)
      }
      history.replaceState = (...args: Parameters<History['replaceState']>) => {
        if (args[2] != null) (window as any).__capturedUrls.push(String(args[2]))
        return origReplace(...args)
      }
    })

    // Collect XHR/fetch URLs that look search-related.
    const searchRequests: string[] = []
    page.on('request', req => {
      const type = req.resourceType()
      if (type !== 'xhr' && type !== 'fetch') return
      const requestUrl = req.url()
      if (/search|book|flight|offer|avail|fare/i.test(requestUrl)) {
        searchRequests.push(requestUrl)
      }
    })

    // Step 1: navigate to the airline website and give client-side rendering a moment.
    await page.goto(airline.website, { waitUntil: 'domcontentloaded', timeout: SITE_TIMEOUT })
    await page.waitForTimeout(3000)

    const startUrl = page.url()

    // ========================================
    // Strategy 1: extract template from links
    // ========================================
    const linkTemplate = await extractTemplateFromLinks(page, startUrl)
    if (linkTemplate) {
      result.bookingUrlTemplate = linkTemplate.template
      result.bookingUrl = linkTemplate.bookingUrl
      result.method = 'link-template'
      return result
    }

    // ========================================
    // Strategy 2: find booking page + check its links
    // ========================================
    const bookingHref = await findBookingLink(page)

    if (bookingHref) {
      const bookingUrl = resolveUrl(startUrl, bookingHref)
      result.bookingUrl = bookingUrl

      try {
        await page.goto(bookingUrl, { waitUntil: 'domcontentloaded', timeout: NAV_TIMEOUT })
        await page.waitForTimeout(2000)
      } catch {
        // Navigation failed, but the URL itself is still worth keeping.
      }

      const bookingPageTemplate = await extractTemplateFromLinks(page, page.url())
      if (bookingPageTemplate) {
        result.bookingUrlTemplate = bookingPageTemplate.template
        result.bookingUrl = bookingPageTemplate.bookingUrl
        result.method = 'booking-page-link-template'
        return result
      }
    } else {
      result.bookingUrl = startUrl
    }

    // ========================================
    // Strategy 3: fill form + capture URL change / pushState / network requests
    // ========================================
    const testOrigin = 'MAD'
    const testDest = 'LHR'
    // Date 30 days from now as YYYY-MM-DD. It is only used to recognise a date in the
    // resulting URL; it is not typed into the form.
    const travelDay = new Date()
    travelDay.setDate(travelDay.getDate() + 30)
    const testDate = travelDay.toISOString().slice(0, 10)

    const filledOrigin = await findAndFillInput(page, ORIGIN_SELECTORS, testOrigin)
    const filledDest = await findAndFillInput(page, DEST_SELECTORS, testDest)
    if (!filledOrigin && !filledDest) return result

    const urlBefore = page.url()
    const clicked = await clickSearchButton(page)
    if (!clicked) return result

    // Wait for a navigation or a pushState to happen.
    await page.waitForTimeout(5000)

    // Check 3a: URL changed (traditional navigation).
    const urlAfter = page.url()
    if (urlAfter !== urlBefore) {
      result.bookingUrlTemplate = buildTemplateFromUrl(urlAfter, testOrigin, testDest, testDate)
      result.bookingUrl = urlAfter
      result.method = 'form-url-change'
      return result
    }

    // Check 3b: pushState/replaceState captured.
    const captured: unknown[] = await page.evaluate(() => (window as any).__capturedUrls || [])
    const relevantCapture = captured.find((u): u is string =>
      typeof u === 'string' && (u.includes(testOrigin) || u.includes(testDest) || /search|book|flight/i.test(u))
    )
    if (relevantCapture) {
      const fullUrl = resolveUrl(page.url(), relevantCapture)
      result.bookingUrlTemplate = buildTemplateFromUrl(fullUrl, testOrigin, testDest, testDate)
      result.bookingUrl = fullUrl
      result.method = 'pushstate'
      return result
    }

    // Check 3c: network requests carrying the search parameters.
    const relevantRequest = searchRequests.find(u => u.includes(testOrigin) || u.includes(testDest))
    if (relevantRequest) {
      result.bookingUrlTemplate = buildTemplateFromUrl(relevantRequest, testOrigin, testDest, testDate)
      result.bookingUrl = relevantRequest
      result.method = 'network-request'
      return result
    }
  } catch (err: unknown) {
    // Anything may be thrown; keep a short, printable reason.
    const message = err instanceof Error ? err.message : String(err)
    result.error = message.slice(0, 200)
  } finally {
    await page?.close().catch(() => {})
  }

  return result
}
|
|
|
|
// --- Main ---
|
|
|
|
/**
 * CLI entry point.
 *
 * Options:
 *   --offset N       first row of the page of airlines to process (default 0)
 *   --limit N        number of airlines to process (default 100)
 *   --iata A,B,C     process exactly these airlines (offset/limit are ignored)
 *   --retry-failed   only airlines whose `booking_url_template` is still null
 *
 * For every airline the discovery is run and, when something was found, the
 * `airlines` row is updated. A write that Supabase rejects counts as a failure.
 * A human-readable summary and a one-line JSON summary are printed at the end.
 *
 * NOTE: with --retry-failed the filtered set shrinks as templates get stored, so
 * the same offset can address different rows on a later run.
 */
async function main(): Promise<void> {
  const { values } = parseArgs({
    options: {
      offset: { type: 'string', default: '0' },
      limit: { type: 'string', default: '100' },
      iata: { type: 'string' },
      'retry-failed': { type: 'boolean', default: false },
    },
  })

  const offset = Number.parseInt(values.offset ?? '0', 10)
  const limit = Number.parseInt(values.limit ?? '100', 10)
  if (!Number.isInteger(offset) || offset < 0 || !Number.isInteger(limit) || limit < 1) {
    console.error(`[discover] Invalid --offset/--limit: offset=${values.offset} limit=${values.limit}`)
    process.exit(1)
  }
  const iataCodes = values.iata?.split(',').map(s => s.trim().toUpperCase()).filter(s => s.length > 0)
  const retryFailed = values['retry-failed']

  if (iataCodes) {
    console.log(`[discover] Starting with specific airlines: ${iataCodes.join(', ')}`)
  } else if (retryFailed) {
    console.log(`[discover] Retrying airlines without template, offset=${offset} limit=${limit}`)
  } else {
    console.log(`[discover] Starting offset=${offset} limit=${limit}`)
  }

  // Fetch airlines from Supabase.
  let query = supabase
    .from('airlines')
    .select('iata, name, website')
    .not('website', 'is', null)
    .order('iata')

  if (iataCodes) {
    query = query.in('iata', iataCodes)
  } else {
    if (retryFailed) {
      // Only process airlines that have no template yet.
      query = query.is('booking_url_template', null)
    }
    query = query.range(offset, offset + limit - 1)
  }

  const { data: airlines, error } = await query

  if (error) {
    console.error('[discover] Failed to fetch airlines:', error.message)
    process.exit(1)
  }

  if (!airlines?.length) {
    console.log('[discover] No airlines to process')
    process.exit(0)
  }

  console.log(`[discover] Processing ${airlines.length} airlines`)

  const browser = await chromium.launch({
    headless: true,
    args: [
      '--disable-blink-features=AutomationControlled',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-http2',
    ],
  })

  let discovered = 0
  let withTemplate = 0
  const failed: string[] = []

  try {
    for (let i = 0; i < airlines.length; i++) {
      const airline = airlines[i] as Airline
      const progress = `[${i + 1}/${airlines.length}]`

      try {
        const result = await discoverAirline(browser, airline)

        if (!result.bookingUrl && !result.bookingUrlTemplate) {
          failed.push(airline.iata)
          console.log(`${progress} ${airline.iata} (${airline.name}): SKIP${result.error ? ` (${result.error.slice(0, 80)})` : ''}`)
          continue
        }

        const update: Record<string, string> = {
          booking_url_discovered_at: new Date().toISOString(),
        }
        if (result.bookingUrl) update.booking_url = result.bookingUrl
        if (result.bookingUrlTemplate) update.booking_url_template = result.bookingUrlTemplate

        // The client reports a rejected write through `error` instead of throwing.
        const { error: updateError } = await supabase
          .from('airlines')
          .update(update)
          .eq('iata', airline.iata)

        if (updateError) {
          failed.push(airline.iata)
          console.log(`${progress} ${airline.iata} (${airline.name}): ERROR saving result: ${updateError.message.slice(0, 80)}`)
          continue
        }

        discovered++
        if (result.bookingUrlTemplate) withTemplate++

        const methodTag = result.method ? ` [${result.method}]` : ''
        console.log(`${progress} ${airline.iata} (${airline.name}): OK${result.bookingUrlTemplate ? ' +template' : ''}${methodTag} -> ${result.bookingUrl?.slice(0, 120)}`)
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err)
        failed.push(airline.iata)
        console.log(`${progress} ${airline.iata} (${airline.name}): ERROR ${message.slice(0, 80)}`)
      }
    }
  } finally {
    // Close the browser even if the loop is left through an unexpected error.
    await browser.close()
  }

  console.log(`\n[discover] Done: ${discovered} discovered, ${withTemplate} with template, ${failed.length} failed`)
  if (failed.length > 0) console.log(`[discover] Failed: ${failed.join(', ')}`)

  // Machine-readable summary on the last line.
  console.log(JSON.stringify({ discovered, withTemplate, failed: failed.length, failedCodes: failed }))
}

main().catch(err => {
  console.error('[discover] Fatal:', err)
  process.exit(1)
})
|