feat: use bulk Spotify API calls & inserts to make spotify crawling more efficient (#271)

* feat(spotify-api): bulk endpoints
* feat(music-library): allow bulk operations
* feat(spotify): bulk track+album+artist+genre import
* feat(spotify): use bulk import api for user crawl
* feat(spotify): bulk listen insert

For the benchmark case of a new user where Listory imports 50 new listens along with all now tracks, artists, albums & genres we significantly reduced the number of things happening:

    Spotify API Requests: 208 => 8
    DB Insert: 96 => 8
    Tracing Spans: 1953 => 66
This commit is contained in:
Julian Tölle 2023-05-07 02:20:43 +02:00 committed by GitHub
parent 24b7308343
commit 8721fd101d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 427 additions and 23 deletions

14
package-lock.json generated
View file

@ -45,6 +45,7 @@
"class-validator": "0.14.0",
"cookie-parser": "1.4.6",
"date-fns": "2.30.0",
"lodash": "^4.17.21",
"nest-raven": "9.2.0",
"nestjs-otel": "5.1.2",
"nestjs-pino": "3.2.0",
@ -70,6 +71,7 @@
"@types/express": "4.17.17",
"@types/hapi__joi": "17.1.9",
"@types/jest": "29.5.1",
"@types/lodash": "^4.14.194",
"@types/long": "4.0.2",
"@types/node": "18.16.5",
"@types/passport-http-bearer": "^1.0.37",
@ -3794,6 +3796,12 @@
"@types/koa": "*"
}
},
"node_modules/@types/lodash": {
"version": "4.14.194",
"resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.194.tgz",
"integrity": "sha512-r22s9tAS7imvBt2lyHC9B8AGwWnXaYb1tY09oyLkXDs4vArpYJzw09nj8MLx5VfciBPGIb+ZwG0ssYnEPJxn/g==",
"dev": true
},
"node_modules/@types/long": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz",
@ -15724,6 +15732,12 @@
"@types/koa": "*"
}
},
"@types/lodash": {
"version": "4.14.194",
"resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.194.tgz",
"integrity": "sha512-r22s9tAS7imvBt2lyHC9B8AGwWnXaYb1tY09oyLkXDs4vArpYJzw09nj8MLx5VfciBPGIb+ZwG0ssYnEPJxn/g==",
"dev": true
},
"@types/long": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz",

View file

@ -62,6 +62,7 @@
"class-validator": "0.14.0",
"cookie-parser": "1.4.6",
"date-fns": "2.30.0",
"lodash": "^4.17.21",
"nest-raven": "9.2.0",
"nestjs-otel": "5.1.2",
"nestjs-pino": "3.2.0",
@ -87,6 +88,7 @@
"@types/express": "4.17.17",
"@types/hapi__joi": "17.1.9",
"@types/jest": "29.5.1",
"@types/lodash": "^4.14.194",
"@types/long": "4.0.2",
"@types/node": "18.16.5",
"@types/passport-http-bearer": "^1.0.37",

View file

@ -108,7 +108,6 @@ export class AuthService {
}
async createApiToken(user: User, description: string): Promise<ApiToken> {
console.log("createApiToken");
const apiToken = this.apiTokenRepository.create();
apiToken.user = user;

View file

@ -14,10 +14,10 @@ export class Listen {
@PrimaryGeneratedColumn("uuid")
id: string;
@ManyToOne(() => Track)
@ManyToOne(() => Track, { eager: true })
track: Track;
@ManyToOne(() => User)
@ManyToOne(() => User, { eager: true })
user: User;
@Column({ type: "timestamp" })

View file

@ -68,4 +68,21 @@ export class ListenRepository extends Repository<Listen> {
isDuplicate: false,
};
}
/**
*
* @param rows
* @returns A list of all new (non-duplicate) listens
*/
async insertsNoConflict(rows: CreateListenRequestDto[]): Promise<Listen[]> {
const result = await this.createQueryBuilder()
.insert()
.values(rows)
.orIgnore()
.execute();
return this.findBy(
result.identifiers.filter(Boolean).map(({ id }) => ({ id }))
);
}
}

View file

@ -31,6 +31,26 @@ export class ListensService {
return response;
}
async createListens(
listensData: CreateListenRequestDto[]
): Promise<Listen[]> {
const existingListens = await this.listenRepository.findBy(listensData);
const missingListens = listensData.filter(
(newListen) =>
!existingListens.some(
(existingListen) =>
newListen.user.id === existingListen.user.id &&
newListen.track.id === existingListen.track.id &&
newListen.playedAt.getTime() === existingListen.playedAt.getTime()
)
);
return this.listenRepository.save(
missingListens.map((entry) => this.listenRepository.create(entry))
);
}
async getListens(
options: GetListensDto & IPaginationOptions
): Promise<Pagination<Listen>> {

View file

@ -33,6 +33,12 @@ export class MusicLibraryService {
});
}
async findArtists(query: FindArtistDto[]): Promise<Artist[]> {
return this.artistRepository.find({
where: query,
});
}
async getArtistWithOldestUpdate(): Promise<Artist | undefined> {
const [oldestArtist] = await this.artistRepository.find({
take: 1,
@ -69,6 +75,12 @@ export class MusicLibraryService {
return artist;
}
async createArtists(data: CreateArtistDto[]): Promise<Artist[]> {
return this.artistRepository.save(
data.map((entry) => this.artistRepository.create(entry))
);
}
async updateArtist({
artist,
updatedFields,
@ -86,6 +98,10 @@ export class MusicLibraryService {
});
}
async findAlbums(query: FindAlbumDto[]): Promise<Album[]> {
return this.albumRepository.find({ where: query });
}
async createAlbum(data: CreateAlbumDto): Promise<Album> {
const album = this.albumRepository.create();
@ -111,12 +127,24 @@ export class MusicLibraryService {
return album;
}
async createAlbums(data: CreateAlbumDto[]): Promise<Album[]> {
return this.albumRepository.save(
data.map((entry) => this.albumRepository.create(entry))
);
}
async findGenre(query: FindGenreDto): Promise<Genre | undefined> {
return this.genreRepository.findOneBy({
name: query.name,
});
}
async findGenres(query: FindGenreDto[]): Promise<Genre[]> {
return this.genreRepository.find({
where: query,
});
}
async createGenre(data: CreateGenreDto): Promise<Genre> {
const genre = this.genreRepository.create();
@ -140,12 +168,22 @@ export class MusicLibraryService {
return genre;
}
async createGenres(data: CreateGenreDto[]): Promise<Genre[]> {
return this.genreRepository.save(
data.map((entry) => this.genreRepository.create(entry))
);
}
async findTrack(query: FindTrackDto): Promise<Track | undefined> {
return this.trackRepository.findOneBy({
spotify: { id: query.spotify.id },
});
}
async findTracks(query: FindTrackDto[]): Promise<Track[]> {
return this.trackRepository.find({ where: query });
}
async createTrack(data: CreateTrackDto): Promise<Track> {
const track = this.trackRepository.create();
@ -171,4 +209,10 @@ export class MusicLibraryService {
return track;
}
async createTracks(data: CreateTrackDto[]): Promise<Track[]> {
return this.trackRepository.save(
data.map((entry) => this.trackRepository.create(entry))
);
}
}

View file

@ -45,6 +45,22 @@ export class SpotifyApiService {
return artist.data;
}
async getArtists(
accessToken: string,
spotifyIDs: string[]
): Promise<ArtistObject[]> {
const artist = await firstValueFrom(
this.httpService.get<{ artists: ArtistObject[] }>(`v1/artists`, {
headers: { Authorization: `Bearer ${accessToken}` },
params: {
ids: spotifyIDs.join(","),
},
})
);
return artist.data.artists;
}
async getAlbum(accessToken: string, spotifyID: string): Promise<AlbumObject> {
const album = await firstValueFrom(
this.httpService.get<AlbumObject>(`v1/albums/${spotifyID}`, {
@ -54,6 +70,21 @@ export class SpotifyApiService {
return album.data;
}
async getAlbums(
accessToken: string,
spotifyIDs: string[]
): Promise<AlbumObject[]> {
const album = await firstValueFrom(
this.httpService.get<{ albums: AlbumObject[] }>(`v1/albums`, {
headers: { Authorization: `Bearer ${accessToken}` },
params: {
ids: spotifyIDs.join(","),
},
})
);
return album.data.albums;
}
async getTrack(accessToken: string, spotifyID: string): Promise<TrackObject> {
const track = await firstValueFrom(
this.httpService.get<TrackObject>(`v1/tracks/${spotifyID}`, {
@ -63,4 +94,20 @@ export class SpotifyApiService {
return track.data;
}
async getTracks(
accessToken: string,
spotifyIDs: string[]
): Promise<TrackObject[]> {
const track = await firstValueFrom(
this.httpService.get<{ tracks: TrackObject[] }>(`v1/tracks`, {
headers: { Authorization: `Bearer ${accessToken}` },
params: {
ids: spotifyIDs.join(","),
},
})
);
return track.data.tracks;
}
}

View file

@ -1,6 +1,7 @@
import type { Job } from "pg-boss";
import { Injectable, Logger } from "@nestjs/common";
import { chunk, uniq } from "lodash";
import { Span } from "nestjs-otel";
import type { Job } from "pg-boss";
import { ListensService } from "../../listens/listens.service";
import { Album } from "../../music-library/album.entity";
import { Artist } from "../../music-library/artist.entity";
@ -21,6 +22,11 @@ import { TrackObject } from "./spotify-api/entities/track-object";
import { SpotifyApiService } from "./spotify-api/spotify-api.service";
import { SpotifyAuthService } from "./spotify-auth/spotify-auth.service";
/** Number of IDs that can be passed to Spotify Web API "Get Several Artist/Track" calls. */
const SPOTIFY_BULK_MAX_IDS = 50;
/** Number of IDs that can be passed to Spotify Web API "Get Several Album" calls. */
const SPOTIFY_BULK_ALBUMS_MAX_IDS = 20;
@Injectable()
export class SpotifyService {
private readonly logger = new Logger(this.constructor.name);
@ -115,27 +121,27 @@ export class SpotifyService {
return;
}
await Promise.all(
playHistory.map(async (history) => {
const track = await this.importTrack(history.track.id);
const tracks = await this.importTracks(
uniq(playHistory.map((history) => history.track.id))
);
const { isDuplicate } = await this.listensService.createListen({
const listenData = playHistory.map((history) => ({
user,
track,
track: tracks.find((track) => history.track.id === track.spotify.id),
playedAt: new Date(history.played_at),
});
}));
if (!isDuplicate) {
const results = await this.listensService.createListens(listenData);
results.forEach((listen) =>
this.logger.debug(
{ userId: user.id },
`New listen found! ${user.id} listened to "${
track.name
}" by ${track.artists
listen.track.name
}" by ${listen.track.artists
?.map((artist) => `"${artist.name}"`)
.join(", ")}`
);
}
})
)
);
const newestPlayTime = new Date(
@ -223,6 +229,94 @@ export class SpotifyService {
});
}
@Span()
async importTracks(
spotifyIDs: string[],
retryOnExpiredToken: boolean = true
): Promise<Track[]> {
const tracks = await this.musicLibraryService.findTracks(
spotifyIDs.map((id) => ({ spotify: { id } }))
);
// Get missing ids
const missingIDs = spotifyIDs.filter(
(id) => !tracks.some((track) => track.spotify.id === id)
);
// No need to make spotify api request if all data is available locally
if (missingIDs.length === 0) {
return tracks;
}
let spotifyTracks: TrackObject[] = [];
// Split the import requests so we stay within the spotify api limits
try {
await Promise.all(
chunk(missingIDs, SPOTIFY_BULK_MAX_IDS).map(async (ids) => {
const batchTracks = await this.spotifyApi.getTracks(
this.appAccessToken,
ids
);
spotifyTracks.push(...batchTracks);
})
);
} catch (err) {
if (err.response && err.response.status === 401 && retryOnExpiredToken) {
await this.refreshAppAccessToken();
return this.importTracks(spotifyIDs, false);
}
throw err;
}
// We import albums & artist in series because the album import also
// triggers an artist import. In the best case, all artists will already be
// imported by the importArtists() call, and the album call can get them
// from the database.
const artists = await this.importArtists(
uniq(
spotifyTracks.flatMap((track) =>
track.artists.map((artist) => artist.id)
)
)
);
const albums = await this.importAlbums(
uniq(spotifyTracks.map((track) => track.album.id))
);
// Find the right albums & artists for each spotify track & create db entry
const newTracks = await this.musicLibraryService.createTracks(
spotifyTracks.map((spotifyTrack) => {
const trackAlbum = albums.find(
(album) => spotifyTrack.album.id === album.spotify.id
);
const trackArtists = spotifyTrack.artists.map((trackArtist) =>
artists.find((artist) => trackArtist.id == artist.spotify.id)
);
return {
name: spotifyTrack.name,
album: trackAlbum,
artists: trackArtists,
spotify: {
id: spotifyTrack.id,
uri: spotifyTrack.uri,
type: spotifyTrack.type,
href: spotifyTrack.href,
},
};
})
);
// Return new & existing tracks
return [...tracks, ...newTracks];
}
@Span()
async importAlbum(
spotifyID: string,
@ -270,6 +364,80 @@ export class SpotifyService {
});
}
@Span()
async importAlbums(
spotifyIDs: string[],
retryOnExpiredToken: boolean = true
): Promise<Album[]> {
const albums = await this.musicLibraryService.findAlbums(
spotifyIDs.map((id) => ({ spotify: { id } }))
);
// Get missing ids
const missingIDs = spotifyIDs.filter(
(id) => !albums.some((album) => album.spotify.id === id)
);
// No need to make spotify api request if all data is available locally
if (missingIDs.length === 0) {
return albums;
}
let spotifyAlbums: AlbumObject[] = [];
// Split the import requests so we stay within the spotify api limits
try {
await Promise.all(
chunk(missingIDs, SPOTIFY_BULK_ALBUMS_MAX_IDS).map(async (ids) => {
const batchAlbums = await this.spotifyApi.getAlbums(
this.appAccessToken,
ids
);
spotifyAlbums.push(...batchAlbums);
})
);
} catch (err) {
if (err.response && err.response.status === 401 && retryOnExpiredToken) {
await this.refreshAppAccessToken();
return this.importAlbums(spotifyIDs, false);
}
throw err;
}
const artists = await this.importArtists(
uniq(
spotifyAlbums.flatMap((album) =>
album.artists.map((artist) => artist.id)
)
)
);
// Find the right albums & artists for each spotify track & create db entry
const newAlbums = await this.musicLibraryService.createAlbums(
spotifyAlbums.map((spotifyAlbum) => {
const albumArtists = spotifyAlbum.artists.map((albumArtist) =>
artists.find((artist) => albumArtist.id == artist.spotify.id)
);
return {
name: spotifyAlbum.name,
artists: albumArtists,
spotify: {
id: spotifyAlbum.id,
uri: spotifyAlbum.uri,
type: spotifyAlbum.type,
href: spotifyAlbum.href,
},
};
})
);
return [...albums, ...newAlbums];
}
@Span()
async importArtist(
spotifyID: string,
@ -315,6 +483,76 @@ export class SpotifyService {
});
}
@Span()
async importArtists(
spotifyIDs: string[],
retryOnExpiredToken: boolean = true
): Promise<Artist[]> {
const artists = await this.musicLibraryService.findArtists(
spotifyIDs.map((id) => ({ spotify: { id } }))
);
// Get missing ids
const missingIDs = spotifyIDs.filter(
(id) => !artists.some((artist) => artist.spotify.id === id)
);
// No need to make spotify api request if all data is available locally
if (missingIDs.length === 0) {
return artists;
}
let spotifyArtists: ArtistObject[] = [];
// Split the import requests so we stay within the spotify api limits
try {
await Promise.all(
chunk(missingIDs, SPOTIFY_BULK_MAX_IDS).map(async (ids) => {
const batchArtists = await this.spotifyApi.getArtists(
this.appAccessToken,
ids
);
spotifyArtists.push(...batchArtists);
})
);
} catch (err) {
if (err.response && err.response.status === 401 && retryOnExpiredToken) {
await this.refreshAppAccessToken();
return this.importArtists(spotifyIDs, false);
}
throw err;
}
const genres = await this.importGenres(
uniq(spotifyArtists.flatMap((artist) => artist.genres))
);
// Find the right genres for each spotify artist & create db entry
const newArtists = await this.musicLibraryService.createArtists(
spotifyArtists.map((spotifyArtist) => {
const artistGenres = spotifyArtist.genres.map((artistGenre) =>
genres.find((genre) => artistGenre == genre.name)
);
return {
name: spotifyArtist.name,
genres: artistGenres,
spotify: {
id: spotifyArtist.id,
uri: spotifyArtist.uri,
type: spotifyArtist.type,
href: spotifyArtist.href,
},
};
})
);
return [...artists, ...newArtists];
}
@Span()
async updateArtist(
spotifyID: string,
@ -368,6 +606,29 @@ export class SpotifyService {
});
}
@Span()
async importGenres(names: string[]): Promise<Genre[]> {
const genres = await this.musicLibraryService.findGenres(
names.map((name) => ({ name }))
);
// Get missing genres
const missingGenres = names.filter(
(name) => !genres.some((genre) => genre.name === name)
);
// No need to create genres if all data is available locally
if (missingGenres.length === 0) {
return genres;
}
const newGenres = await this.musicLibraryService.createGenres(
missingGenres.map((name) => ({ name }))
);
return [...genres, ...newGenres];
}
@Span()
private async refreshAppAccessToken(): Promise<void> {
if (!this.appAccessTokenInProgress) {