refactor: move to using RSS

This commit is contained in:
Derrick Hammer 2023-12-24 08:16:23 -05:00
parent 368e42e709
commit 232df2b60a
Signed by: pcfreak30
GPG Key ID: C997C339BE476FF2
2 changed files with 15 additions and 24 deletions

View File

@@ -7,10 +7,10 @@ import {
import { S5Client } from "@lumeweb/s5-js"; import { S5Client } from "@lumeweb/s5-js";
import xml2js from "xml2js"; import xml2js from "xml2js";
import { prisma } from "@/lib/prisma"; import { prisma } from "@/lib/prisma";
import * as cheerio from "cheerio";
import slugify from "slugify"; import slugify from "slugify";
import path from "path"; import path from "path";
import { getAvailableSites } from "@/utils.js"; import { getAvailableSites } from "@/utils.js";
import { CID } from "@lumeweb/libs5";
// Action function for POST requests // Action function for POST requests
export async function action({ request }: ActionFunctionArgs) { export async function action({ request }: ActionFunctionArgs) {
@@ -34,15 +34,20 @@ export async function action({ request }: ActionFunctionArgs) {
}; };
}; };
if (!("sitemap.xml" in paths)) { // Check if the RSS feed path exists in the paths
throw new Response("Sitemap not found", { status: 404 }); if (!(siteInfo.rss in paths)) {
throw new Response("RSS feed not found", { status: 404 });
} }
const sitemapData = await client.downloadData(paths[siteInfo.sitemap].cid); // Download and parse the RSS feed
const sitemap = await xml2js.parseStringPromise(sitemapData); const rssData = await client.downloadData(paths[siteInfo.rss].cid);
const rss = await xml2js.parseStringPromise(rssData);
// Process each item in the RSS feed
for (const item of rss.rss.channel[0].item) {
const url = item.link[0];
const title = item.title[0]; // Title is directly available from the feed
const urls = sitemap.urlset.url.map((urlEntry: any) => {
const url = urlEntry.loc[0];
let pathname = new URL(url).pathname; let pathname = new URL(url).pathname;
// Normalize and remove leading and trailing slashes from the path // Normalize and remove leading and trailing slashes from the path
@@ -50,7 +55,6 @@ export async function action({ request }: ActionFunctionArgs) {
// Function to determine if a URL path represents a directory // Function to determine if a URL path represents a directory
const isDirectory = (pathname: string) => { const isDirectory = (pathname: string) => {
// Check if the path directly maps to a file in the paths object
return !paths.hasOwnProperty(pathname); return !paths.hasOwnProperty(pathname);
}; };
@@ -65,35 +69,22 @@ export async function action({ request }: ActionFunctionArgs) {
} }
} }
// Fetch cid after confirming the final path
const cid = paths[pathname]?.cid; const cid = paths[pathname]?.cid;
return { url, cid, path: pathname }; // including cid in return object after final path is determined
});
for (const { url, cid } of urls) {
if (cid) { if (cid) {
const exists = await prisma.article.findUnique({ const exists = await prisma.article.findUnique({
where: { cid }, where: { cid },
}); });
if (!exists) { if (!exists) {
// Fetch and parse the content using CID
const contentData = Buffer.from(
await client.downloadData(cid)
).toString();
const $ = cheerio.load(contentData);
const title = $("title").text(); // Extract the title from the content
const record = { const record = {
title, title,
url, url,
cid: cid, cid: CID.decode(cid).toString(),
createdAt: new Date(), createdAt: new Date(),
updatedAt: new Date(), updatedAt: new Date(),
slug: slugify(new URL(url).pathname), slug: slugify(new URL(url).pathname),
siteKey: slugify(data.site as string), siteKey: data.site,
}; };
// Insert a new record into the database // Insert a new record into the database

2
app/types.d.ts vendored
View File

@@ -14,6 +14,6 @@ export type SelectOptions = {
export type SiteList = { export type SiteList = {
[domain: string]: { [domain: string]: {
name: string; name: string;
sitemap: string; rss: string;
}; };
}; };