Skip to content

Commit

Permalink
feat: 🎸 youtube video as datasource
Browse files Browse the repository at this point in the history
  • Loading branch information
OdapX authored and gmpetrov committed Nov 20, 2023
1 parent 8c0dab1 commit abe3e67
Show file tree
Hide file tree
Showing 12 changed files with 421 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ const options: DatsourceOption[] = [
isPremium: true,
icon: 'https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Google_Drive_icon_%282020%29.svg/1024px-Google_Drive_icon_%282020%29.svg.png?20221103153031',
},
{
type: 'youtube_video',
label: 'Youtube',
description:
'Paste a youtube video, playlist or channel and make it your source of knowlege',
disabled: false,
icon: 'https://www.svgrepo.com/show/13671/youtube.svg',
isPremium: true,
},
{
type: 'notion' as any,
label: 'Notion',
Expand Down
67 changes: 67 additions & 0 deletions apps/dashboard/components/DatasourceForms/YoutubeForm.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import React, { useEffect } from 'react';
import { useFormContext } from 'react-hook-form';
import { z } from 'zod';

import Input from '@app/components/Input';

import {
DatasourceSchema,
DatasourceYoutube,
} from '@chaindesk/lib/types/models';
import YoutubeApi from '@chaindesk/lib/youtube-api';
import { DatasourceType } from '@chaindesk/prisma';

import Base from './Base';
import type { DatasourceFormProps } from './types';

// Props accepted by YoutubeForm: the generic datasource form props
// specialised to the YouTube datasource variants.
type Props = DatasourceFormProps<DatasourceYoutube> & {};

/**
 * Guesses which YouTube datasource type a URL refers to, using plain
 * substring checks.
 *
 * - contains `@` or `list`  -> `DatasourceType.youtube_bulk`
 * - otherwise contains `watch` -> `DatasourceType.youtube_video`
 * - anything else -> `null`
 *
 * The bulk check runs first, so a watch URL that also carries a `list`
 * parameter is classified as bulk.
 */
function getDatasourceType(url: string) {
  const looksLikeBulk = url.includes('@') || url.includes('list');
  if (looksLikeBulk) {
    return DatasourceType.youtube_bulk;
  }

  return url.includes('watch') ? DatasourceType.youtube_video : null;
}

/**
 * Form body for the YouTube datasource: a single URL input bound to
 * `config.source_url` through the surrounding react-hook-form context.
 *
 * Whenever the URL changes, the datasource `type` form field is updated
 * with the value derived by `getDatasourceType`. If the URL matches neither
 * pattern (the helper returns `null`) the current `type` is left untouched.
 */
function Nested() {
  const { control, register, setValue, watch } =
    useFormContext<DatasourceYoutube>();

  const url = watch('config.source_url');

  useEffect(() => {
    // `url` may be undefined before the user types anything.
    const type = getDatasourceType(url || '');
    if (type) {
      setValue('type', type);
    }
    // `setValue` is listed so the effect's dependencies are complete.
  }, [url, setValue]);

  return (
    <Input
      label="Youtube URL (video, playlist or channel)"
      helperText="e.g.: https://www.youtube.com/watch?v=Jq_XKf5slVc"
      control={control as any}
      {...register('config.source_url')}
    />
  );
}

/**
 * Datasource form for YouTube sources (single video, playlist or channel).
 *
 * Wraps `Base` with `DatasourceSchema` as the validation schema, forwards
 * every prop except `defaultValues` unchanged, passes a shallow copy of
 * `defaultValues`, and renders the URL input (`Nested`) as the form body.
 */
export default function YoutubeForm(props: Props) {
  const { defaultValues, ...baseProps } = props;
  const initialValues = { ...defaultValues! };

  return (
    <Base
      schema={DatasourceSchema}
      {...baseProps}
      defaultValues={initialValues}
    >
      <Nested />
    </Base>
  );
}
6 changes: 6 additions & 0 deletions apps/dashboard/components/DatasourceForms/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ const NotionForm = dynamic(() => import('./NotionForm'), {
ssr: false,
});

// YouTube datasource form, loaded through `dynamic` with a lazy
// `import('./YoutubeForm')` and `ssr: false`.
// NOTE(review): presumably `dynamic` is next/dynamic, in which case
// `ssr: false` restricts rendering to the client — confirm against the
// file's imports.
const YoutubeForm = dynamic(() => import('./YoutubeForm'), {
ssr: false,
});

const DatasourceFormsMap = {
[DatasourceType.web_page]: WebPageForm,
[DatasourceType.text]: TextForm,
Expand All @@ -41,6 +45,8 @@ const DatasourceFormsMap = {
[DatasourceType.qa]: QAForm,
[DatasourceType.notion]: NotionForm,
[DatasourceType.notion_page]: NotionForm,
[DatasourceType.youtube_video]: YoutubeForm,
[DatasourceType.youtube_bulk]: YoutubeForm,
} as Record<DatasourceType, any>;

type Props = {
Expand Down
10 changes: 8 additions & 2 deletions packages/lib/loaders/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import { QALoader } from './qa';
import { TextLoader } from './text';
import { WebPageLoader } from './web-page';
import { WebSiteLoader } from './web-site';
import { BulkYoutubesLoader } from './youtube-bulk';
import { YoutubeVideoLoader } from './youtube-video';

export class DatasourceLoader {
datasource: Datasource;
Expand All @@ -29,12 +31,16 @@ export class DatasourceLoader {
[DatasourceType.google_drive_folder]: GoogleDriveFolderLoader,
[DatasourceType.qa]: QALoader,
[DatasourceType.notion_page]: NotionPageLoader,
[DatasourceType.notion]: NotionLoader as any,
[DatasourceType.notion]: NotionLoader,
[DatasourceType.youtube_video]: YoutubeVideoLoader,
[DatasourceType.youtube_bulk]: BulkYoutubesLoader,
};

constructor(datasource: Datasource) {
this.datasource = datasource;
this.manager = new this.loadersMap[this.datasource.type](this.datasource);
this.manager = new this.loadersMap[this.datasource.type](
this.datasource as any
);
this.isGroup = this.manager.isGroup;
}

Expand Down
91 changes: 91 additions & 0 deletions packages/lib/loaders/youtube-bulk.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import axios from 'axios';
import cuid from 'cuid';
import { google } from 'googleapis';
import { YoutubeTranscript } from 'youtube-transcript';

import generateFunId from '@chaindesk/lib/generate-fun-id';
import { AppDocument } from '@chaindesk/lib/types/document';
import { DatasourceSchema } from '@chaindesk/lib/types/models';
import { DatasourceStatus, DatasourceType } from '@chaindesk/prisma';
import { prisma } from '@chaindesk/prisma/client';

import triggerTaskLoadDatasource from '../trigger-task-load-datasource';
import YoutubeApi from '../youtube-api';

import { DatasourceLoaderBase } from './base';

// The member of the DatasourceSchema union whose `type` is 'youtube_bulk'.
type BulkYoutubeDatasource = Extract<
DatasourceSchema,
{ type: 'youtube_bulk' }
>;

/**
 * Loader for a "bulk" YouTube datasource (channel or playlist URL).
 *
 * It produces no documents itself. Instead it lists the videos behind the
 * URL, creates one child `youtube_video` datasource per video (linked via
 * `groupId`), marks this datasource as synched, and enqueues a load task for
 * every child.
 */
export class BulkYoutubesLoader extends DatasourceLoaderBase<BulkYoutubeDatasource> {
  isGroup = true;

  /** Always reports a size of 0; the `text` argument is ignored. */
  async getSize(text: string) {
    return 0;
  }

  /**
   * Expands the channel/playlist into child video datasources.
   *
   * @returns an empty document list — the children carry the actual content.
   * @throws Error when `config.source_url` is missing or when
   *   `YoutubeApi.getYoutubeLinkType` classifies the URL as `'unknown'`.
   */
  async load() {
    const url = this.datasource.config['source_url'];

    if (!url) {
      throw new Error('Fatal: missing or invalid url');
    }

    const type = YoutubeApi.getYoutubeLinkType(url);

    let videos: { id: string; title: string }[] = [];
    const Youtube = new YoutubeApi();

    switch (type) {
      case 'channel':
        videos = await Youtube.getVideosForChannel(url);
        break;
      case 'playlist':
        videos = await Youtube.getVideosForPlaylist(url);
        break;
      case 'unknown':
        throw new Error('Invalid youtube Url');
    }

    // Pre-generate the child ids so the same ids can be used both for the
    // inserted rows and for the load tasks enqueued afterwards.
    const ids: string[] = videos.map(() => cuid());

    // Only database writes live inside the transaction.
    await prisma.$transaction(async (tx) => {
      await tx.appDatasource.createMany({
        data: videos.map((video, index) => ({
          id: ids[index],
          type: DatasourceType.youtube_video,
          name: video?.title || `${generateFunId()}`,
          config: {
            source_url: `https://www.youtube.com/watch?v=${video?.id}`,
          },
          organizationId: this.datasource?.organizationId!,
          datastoreId: this.datasource?.datastoreId,
          groupId: this.datasource?.id,
          serviceProviderId: this.datasource?.serviceProviderId,
        })),
        skipDuplicates: true,
      });

      await tx.appDatasource.update({
        where: {
          id: this.datasource.id,
        },
        data: {
          status: DatasourceStatus.synched,
        },
      });
    });

    // Enqueue the child load tasks only after the transaction has committed:
    // a task must never reference a datasource row that is not yet visible
    // (or that gets rolled back), and the transaction should not be held
    // open while waiting on work outside the database.
    await triggerTaskLoadDatasource(
      videos.map((_, index) => ({
        organizationId: this.datasource?.organizationId!,
        datasourceId: ids[index],
        priority: 10,
      }))
    );

    return [] as AppDocument[];
  }
}
69 changes: 69 additions & 0 deletions packages/lib/loaders/youtube-video.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import axios from 'axios';
import { YoutubeTranscript } from 'youtube-transcript';

import { AppDocument } from '@chaindesk/lib/types/document';
import { DatasourceSchema } from '@chaindesk/lib/types/models';

import { ApiError, ApiErrorType } from '../api-error';
import cleanTextForEmbeddings from '../clean-text-for-embeddings';

import { DatasourceLoaderBase } from './base';

// The member of the DatasourceSchema union whose `type` is 'youtube_video'.
type DatasourceYoutubeVideo = Extract<
DatasourceSchema,
{ type: 'youtube_video' }
>;

// Shape of one transcript entry as consumed by YoutubeVideoLoader.load().
// NOTE(review): load() divides `offset` by 1000 before using it as the `t`
// link parameter, so it is presumably in milliseconds — confirm against the
// installed youtube-transcript version.
type YoutubeTranscriptType = { text: string; duration: number; offset: number };

/**
 * Returns `url` with its `t` query parameter set to `seconds`.
 *
 * Uses the URL API so the result is well-formed whether or not `url` already
 * has a query string (or an existing `t`). If `url` cannot be parsed as an
 * absolute URL, falls back to plain `&t=` concatenation.
 */
function withYoutubeTimestamp(url: string, seconds: number): string {
  try {
    const parsed = new URL(url);
    parsed.searchParams.set('t', String(seconds));
    return parsed.toString();
  } catch {
    return `${url}&t=${seconds}`;
  }
}

/**
 * Loader for a single YouTube video datasource.
 *
 * Fetches the video's transcript and turns every transcript entry into one
 * document whose `source_url` deep-links to that entry's timestamp.
 */
export class YoutubeVideoLoader extends DatasourceLoaderBase<DatasourceYoutubeVideo> {
  /** Always reports a size of 0; the `text` argument is ignored. */
  async getSize(text: string) {
    return 0;
  }

  /**
   * Builds the documents for this video.
   *
   * @returns one `{ pageContent, metadata }` object per transcript entry, or
   *   a single placeholder document when the transcript cannot be fetched.
   * @throws Error when `config.source_url` is missing.
   */
  async load() {
    const url = this.datasource.config['source_url'];

    if (!url) {
      throw new Error('Fatal: missing youtube url.');
    }

    let docs: AppDocument<any>[] = [];
    try {
      const transcripts: YoutubeTranscriptType[] =
        await YoutubeTranscript.fetchTranscript(url);
      docs = transcripts.map(({ text, offset }) => {
        return new AppDocument<any>({
          pageContent: text,
          metadata: {
            // offset / 1000 converts the entry offset to the whole-second
            // value used for the `t` link parameter.
            source_url: withYoutubeTimestamp(url, Math.ceil(offset / 1000)),
          },
        });
      });
    } catch (err) {
      // Best-effort fallback kept from the original behaviour: any failure
      // yields a single placeholder document. The underlying error is logged
      // so failures other than disabled captions are not lost.
      console.error(
        `[YoutubeVideoLoader] failed to fetch transcript for ${url}`,
        err
      );
      docs = [
        new AppDocument<any>({
          pageContent: 'FAILED: Captions Are Disabled on this Video.',
          metadata: {
            source_url: url,
          },
        }),
      ];
    }

    // Attach the datasource-level metadata to every document.
    return docs.map(({ pageContent, metadata }) => {
      return {
        pageContent,
        metadata: {
          ...metadata,
          datastore_id: this.datasource.datastoreId!,
          datasource_id: this.datasource.id,
          datasource_name: this.datasource.name,
          datasource_type: this.datasource.type,
          custom_id: this.datasource?.config?.custom_id,
          tags: this.datasource?.config?.tags || [],
        },
      };
    });
  }
}
1 change: 1 addition & 0 deletions packages/lib/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
"swr": "^2.1.1",
"uuid": "^9.0.0",
"xlsx": "^0.18.5",
"youtube-transcript": "^1.0.6",
"zod": "3.21.4"
},
"devDependencies": {
Expand Down
28 changes: 28 additions & 0 deletions packages/lib/types/models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,30 @@ export const DatasourceSchema = z.discriminatedUnion('type', [
source_url: z.string().trim().url(),
}),
}),
DatasourceBaseSchema.extend({
type: z.literal(DatasourceType.youtube_video),
config: DatasourceConfigBaseSchema.extend({
source_url: z
.string()
.trim()
.url()
.refine((url) => url.includes('youtube'), {
message: 'URL must be a YouTube URL',
}),
}),
}),
DatasourceBaseSchema.extend({
type: z.literal(DatasourceType.youtube_bulk),
config: DatasourceConfigBaseSchema.extend({
source_url: z
.string()
.trim()
.url()
.refine((url) => url.includes('youtube'), {
message: 'URL must be a YouTube URL',
}),
}),
}),
DatasourceBaseSchema.extend({
type: z.literal(DatasourceType.file),
file: z.any(),
Expand Down Expand Up @@ -173,6 +197,10 @@ export type DatasourceFile = Extract<DatasourceSchema, { type: 'file' }>;
export type DatasourceText = Extract<DatasourceSchema, { type: 'text' }>;
export type DatasourceWebPage = Extract<DatasourceSchema, { type: 'web_page' }>;
export type DatasourceWebSite = Extract<DatasourceSchema, { type: 'web_site' }>;
// The members of the DatasourceSchema union whose `type` is either
// 'youtube_bulk' or 'youtube_video'.
export type DatasourceYoutube = Extract<
DatasourceSchema,
{ type: 'youtube_bulk' | 'youtube_video' }
>;
export type DatasourceGoogleDrive = Extract<
DatasourceSchema,
{ type: 'google_drive_file' | 'google_drive_folder' }
Expand Down
Loading

1 comment on commit abe3e67

@vercel
Copy link

@vercel vercel bot commented on abe3e67 Nov 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.