annotate search/meeting_docs.py @ 9:d1b54241a731

rewrite meeting fetcher
author drewp@bigasterisk.com
date Wed, 10 Jul 2024 12:25:06 -0700
parents
children 13438795d896
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
1 import json
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
2 from pathlib import Path
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
3 from pprint import pprint
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
4 import time
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
5 import requests
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
6 from lxml.html import fromstring
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
7
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
8
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
def getMeetingText(meetingUrl) -> list[str]:
    """Fetch a meeting page and return the non-blank text chunks of its
    ``div#meetingSection`` element, in document order.

    Chunks are returned as they appear in the page (not stripped); chunks
    that are empty or whitespace-only are omitted. The contents of any
    ``<style>`` elements inside the section are excluded.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an HTTP error status.
        IndexError: the page has no ``div#meetingSection`` element.
    """
    # Without a timeout a stalled server would hang the whole run.
    resp = requests.get(meetingUrl, timeout=30)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    resp.raise_for_status()

    page = fromstring(resp.text)
    sections = page.cssselect('div#meetingSection')
    if not sections:
        raise IndexError(f'no div#meetingSection found in {meetingUrl}')
    section = sections[0]

    for styleEl in section.cssselect('style'):
        # Drop the CSS source but keep the tail: a plain clear() would also
        # discard the visible text that follows the <style> tag.
        styleEl.clear(keep_tail=True)

    # str() turns lxml's "smart string" results into plain strings so the
    # returned list does not keep the parsed tree alive.
    return [
        str(chunk)
        for chunk in section.xpath('.//text()')
        if chunk.strip()
    ]
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
20
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
21
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
def gatherMtgs(mtg):
    """Write ``data/albany/meetingId/<id>/agenda.json`` for one meeting.

    Does nothing if that file already exists. Otherwise the file receives a
    JSON object with the creation time, the full ``mtg`` dict, its
    ``videoUrl``, the agenda page URL (or null) and the agenda text phrases
    (empty when no agenda page was found).

    Args:
        mtg: meeting dict. Reads ``mtg['id']`` and ``mtg['videoUrl']``, and
            the optional ``mtg['documentList']`` whose entries carry
            ``templateName`` and ``templateId``.
    """
    outDir = Path(f'data/albany/meetingId/{mtg["id"]}')
    outDir.mkdir(parents=True, exist_ok=True)
    outFile = outDir / 'agenda.json'
    if outFile.exists():
        return

    meetingUrl = None
    # `or []` also covers a documentList that is present but null.
    for doc in mtg.get('documentList') or []:
        if doc['templateName'] == 'HTML Agenda Packet':
            tid = doc['templateId']
            # No break: if several packets are listed, the last one wins.
            meetingUrl = f'https://albanyca.primegov.com/Portal/Meeting?meetingTemplateId={tid}'

    row = {
        'created': time.time(),
        'mtg': mtg,
        'videoUrl': mtg['videoUrl'],
        'meetingUrl': meetingUrl,
        'phrases': getMeetingText(meetingUrl) if meetingUrl else [],
    }

    # Write to a sibling temp file and rename it into place, so an
    # interrupted run cannot leave a truncated agenda.json that the
    # exists() check above would then skip forever.
    tmpFile = outDir / 'agenda.json.tmp'
    tmpFile.write_text(json.dumps(row, indent=2))
    tmpFile.replace(outFile)
    print(f'wrote {outFile}')
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
43
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
44
d1b54241a731 rewrite meeting fetcher
drewp@bigasterisk.com
parents:
diff changeset
def _listMeetings(url):
    """GET a meeting-list endpoint and return its decoded JSON body."""
    # Without a timeout a stalled server would hang the script indefinitely.
    resp = requests.get(url, timeout=30)
    # Surface HTTP errors here rather than as a confusing JSON decode error.
    resp.raise_for_status()
    return resp.json()


# Archived meetings; the year is fixed in the query string.
for mtg in _listMeetings(
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListArchivedMeetings?year=2024"
):
    gatherMtgs(mtg)

for mtg in _listMeetings(
        "https://albanyca.primegov.com/api/v2/PublicPortal/ListUpcomingMeetings"
):
    gatherMtgs(mtg)
    # NOTE(review): only the first upcoming meeting is processed. This looks
    # like a leftover debugging limit - confirm before removing the break.
    break