April, 2024
2024-04-04
- Work on CGSpace duplicate DOIs more
Documenting day-to-day work on the CGSpace repository.
dspace make-handle-config
script and diffed it with the one from DSpace 6
$ for file in dspace.log.2023-11-[23]*; do echo "$file"; grep -a -oE 'session_id=[A-Z0-9]{32}' "$file" | sort | uniq | wc -l; done
dspace.log.2023-11-20
22865
dspace.log.2023-11-21
20296
dspace.log.2023-11-22
19688
dspace.log.2023-11-23
17906
dspace.log.2023-11-24
18453
dspace.log.2023-11-25
17513
dspace.log.2023-11-26
19037
dspace.log.2023-11-27
21103
dspace.log.2023-11-28
23023
dspace.log.2023-11-29
23545
dspace.log.2023-11-30
21298
# awk '{print $1}' /var/log/nginx/{access,library-access,oai,rest}.log.1 | sort | uniq | wc -l
17023
# awk '{print $1}' /var/log/nginx/{access,library-access,oai,rest}.log.2.gz | sort | uniq | wc -l
17294
# awk '{print $1}' /var/log/nginx/{access,library-access,oai,rest}.log.3.gz | sort | uniq | wc -l
22057
# awk '{print $1}' /var/log/nginx/{access,library-access,oai,rest}.log.4.gz | sort | uniq | wc -l
32956
# awk '{print $1}' /var/log/nginx/{access,library-access,oai,rest}.log.5.gz | sort | uniq | wc -l
11415
# awk '{print $1}' /var/log/nginx/{access,library-access,oai,rest}.log.6.gz | sort | uniq | wc -l
15444
# awk '{print $1}' /var/log/nginx/{access,library-access,oai,rest}.log.7.gz | sort | uniq | wc -l
12648
$ chrt -b 0 ./run.sh -s http://localhost:8081/solr/statistics -a export -o /tmp/stats-export.json -f 'owningItem:b5862bfa-9799-4167-b1cf-76f0f4ea1e18' -k uid
city:com*
but ended up finding many that have missing bundles, container bitstreams, etc:
city:com* AND -bundleName:[* TO *] AND -containerBitstream:[* TO *] AND -file_id:[* TO *] AND -owningItem:[* TO *] AND -version_id:[* TO *]
fix_maxmind_stats.py
script and fixed 1.6 million records and imported them on CGSpace after testing on DSpace 7 Test
$ for network in $(csvcut -c network /tmp/ips.csv | sed 1d | sort -u); do grepcidr $network ~/src/git/rmg-ansible-public/roles/dspace/files/nginx/bot-networks.conf; done
108.128.0.0/13 'bot';
46.137.0.0/16 'bot';
52.208.0.0/13 'bot';
52.48.0.0/13 'bot';
54.194.0.0/15 'bot';
54.216.0.0/14 'bot';
54.220.0.0/15 'bot';
54.228.0.0/15 'bot';
63.32.242.35/32 'bot';
63.32.0.0/14 'bot';
99.80.0.0/15 'bot'
ilri/generate_solr_statistics.py
$ csvcut -c 'cg.contributor.affiliation[en_US]' /tmp/initiatives.csv \
| sed -e 1d -e 's/^"//' -e 's/"$//' -e 's/||/\n/g' -e '/^$/d' \
| sort | uniq -c | sort -hr \
| awk 'BEGIN { FS = "^[[:space:]]+[[:digit:]]+[[:space:]]+" } {print $2}'\
| sed -e '1i cg.contributor.affiliation' -e 's/^\(.*\)$/"\1"/' \
> /tmp/2023-12-08-initiatives-affiliations.csv
localhost/dspace7= ☘ \COPY (SELECT DISTINCT text_value AS "dc.contributor.author", count(*) FROM metadatavalue WHERE dspace_object_id in (SELECT dspace_object_id FROM item) AND metadata_field_id = 3 GROUP BY "dc.contributor.author" ORDER BY count DESC) to /tmp/2023-12-08-authors.csv WITH CSV HEADER;
COPY 102435
frontend/Dockerfile
before building!
2023-12-19 17:49:28,022 ERROR unknown unknown org.dspace.rest.Resource @ Something get wrong. Aborting context in finally statement.
dcterms.subject
in SQL:
dspace=# BEGIN;
BEGIN
dspace=*# UPDATE metadatavalue SET text_value=LOWER(text_value) WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=187 AND text_value ~ '[[:upper:]]';
UPDATE 462
dspace=*# COMMIT;
COMMIT
$ curl 'http://localhost:8983/solr/statistics/replication?command=backup'
{
"responseHeader":{
"status":0,
"QTime":26},
"status":"OK"}
# du -sh /var/solr/data/configsets/statistics/data/*
22G /var/solr/data/configsets/statistics/data/index
16G /var/solr/data/configsets/statistics/data/snapshot.20231225074111671
4.0K /var/solr/data/configsets/statistics/data/snapshot_metadata
# du -sh /var/solr/data/configsets/statistics/data/*
22G /var/solr/data/configsets/statistics/data/index
20G /var/solr/data/configsets/statistics/data/snapshot.20231225074111671
4.0K /var/solr/data/configsets/statistics/data/snapshot_metadata
# du -sh /var/solr/data/configsets/statistics/data/*
22G /var/solr/data/configsets/statistics/data/index
21G /var/solr/data/configsets/statistics/data/snapshot.20231225074111671
4.0K /var/solr/data/configsets/statistics/data/snapshot_metadata
# du -sh /var/solr/data/configsets/statistics/data/*
22G /var/solr/data/configsets/statistics/data/index
22G /var/solr/data/configsets/statistics/data/snapshot.20231225074111671
4.0K /var/solr/data/configsets/statistics/data/snapshot_metadata
$ curl http://localhost:8983/solr/statistics/update -H "Content-type: text/xml" --data-binary '<delete><query>*:*</query></delete>'
$ curl http://localhost:8983/solr/statistics/update -H "Content-type: text/xml" --data-binary '<commit />'
$ curl 'http://localhost:8983/solr/statistics/replication?command=restore&name=statistics'
# du -sh /var/solr/data/configsets/statistics/data/*
4.0K /var/solr/data/configsets/statistics/data/index.properties
22G /var/solr/data/configsets/statistics/data/restore.20231225154626463
4.0K /var/solr/data/configsets/statistics/data/snapshot_metadata
22G /var/solr/data/configsets/statistics/data/snapshot.statistics
inurl:mahider.cgiar.org
: 0 results on Google!
inurl:mahider.ilri.org
: 2,100 results on Google
inurl:mahider.ilri.org inurl:https
: 2 results on Google (!)
inurl:dspace.ilri.org
: 1,390 results on Google
inurl:dspace.ilri.org inurl:https
: 0 results on Google (!)
crossref_doi_lookup.py
script while running some checks from 22,000 CGSpace DOIs
localhost/dspacetest= ☘ \COPY (SELECT DISTINCT(lower(text_value)) AS "subject" FROM metadatavalue WHERE dspace_object_id in (SELECT dspace_object_id FROM item) AND metadata_field_id IN (187, 120, 210, 122, 215, 127, 208, 124, 128, 123, 125, 135, 203, 236, 238, 119)) to /tmp/2023-07-07-cgspace-subjects.csv WITH CSV HEADER;
COPY 26443
Time: 2564.851 ms (00:02.565)
$ csvcut -c subject /tmp/2023-07-07-cgspace-subjects.csv | sed '1d' > /tmp/2023-07-07-cgspace-subjects.txt
$ ./ilri/agrovoc_lookup.py -i /tmp/2023-07-07-cgspace-subjects.txt -o /tmp/2023-07-07-cgspace-subjects-results.csv
$ csvgrep -c 'match type' -r '^.+$' ~/Downloads/2023-07-07-cgspace-subjects-resolved.csv | sed 1d | wc -l
12528
dcterms.issued:[2018 TO 2023] AND dcterms.type:"Journal Article" AND (dcterms.subject:flooding OR dcterms.subject:flood OR dcterms.subject:"extreme weather events" OR dcterms.subject:drought OR dcterms.subject:"drought resistance" OR dcterms.subject:"drought tolerance" OR dcterms.subject:"soil salinity" OR dcterms.subject:"pests of plants" OR dcterms.subject:pests OR dcterms.subject:heat OR dcterms.subject:fertilizers OR dcterms.subject:"fertilizer technology" OR dcterms.subject:"rice fields" OR dcterms.subject:"landscape conservation" OR dcterms.subject:"landscape restoration" OR dcterms.subject:livestock)
org.dspace.discovery.SearchServiceException: org.apache.solr.search.SyntaxError: Cannot parse 'dcterms.issued:[2018 TO 2023] AND dcterms.type:"Journal Article" AND (dcterms.subject:flooding OR dcterms.subject:flood OR dcterms.subject:"extreme weather events" OR dcterms.subject:drought OR dcterms.subject:"drought resistance" OR dcterms.subject:"drought tolerance" OR dcterms.subject:"soil salinity" OR dcterms.subject:"pests of plants" OR dcterms.subject:pests OR dcterms.subject:heat OR dcterms.subject:fertilizers OR dcterms.subject:"fertilizer technology" OR dcterms.subject:"rice fields" OR dcterms.subject:livestock OR dcterms.subject:"landscape conservation" OR dcterms.subject:"landscape restoration\"\)': Lexical error at line 1, column 617. Encountered: <EOF> after : "\"landscape restoration\\\"\\)"
resolve_orcids.py
to refresh the names in our database
$ psql < locks-age.sql | less -S
$ psql < locks-age.sql | grep -E " (19|18|17|16|12):" | awk -F"|" '{print $10}' | sort -u | xargs kill
cg.subject.impactPlatform
) to CGSpace
$ chrt -b 0 dspace solr-export-statistics -i statistics
$ dspace solr-import-statistics -i statistics
Exception: Error from server at http://localhost:8983/solr/statistics: ERROR: [doc=1a92472e-e39d-4602-9b4d-da022df8f233] unknown field 'containerCommunity'
org.apache.solr.client.solrj.impl.HttpSolrClient$RemoteSolrException: Error from server at http://localhost:8983/solr/statistics: ERROR: [doc=1a92472e-e39d-4602-9b4d-da022df8f233] unknown field 'containerCommunity'
at org.apache.solr.client.solrj.impl.HttpSolrClient.executeMethod(HttpSolrClient.java:681)
at org.apache.solr.client.solrj.impl.HttpSolrClient.request(HttpSolrClient.java:266)
at org.apache.solr.client.solrj.impl.HttpSolrClient.request(HttpSolrClient.java:248)
at org.apache.solr.client.solrj.SolrClient.request(SolrClient.java:1290)
at org.dspace.util.SolrImportExport.importIndex(SolrImportExport.java:465)
at org.dspace.util.SolrImportExport.main(SolrImportExport.java:148)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:568)
at org.dspace.app.launcher.ScriptLauncher.runOneCommand(ScriptLauncher.java:277)
at org.dspace.app.launcher.ScriptLauncher.handleScript(ScriptLauncher.java:133)
at org.dspace.app.launcher.ScriptLauncher.main(ScriptLauncher.java:98)
$ chrt -b 0 ./run.sh -s http://localhost:8081/solr/statistics -a export -o /tmp/statistics-2022.json -f 'time:[2022-01-01T00\:00\:00Z TO 2022-12-31T23\:59\:59Z]' -k uid -S author_mtdt,author_mtdt_search,iso_mtdt_search,iso_mtdt,subject_mtdt,subject_mtdt_search,containerCollection,containerCommunity,containerItem,countryCode_ngram,countryCode_search,cua_version,dateYear,dateYearMonth,geoipcountrycode,geoIpCountryCode,ip_ngram,ip_search,isArchived,isInternal,isWithdrawn,containerBitstream,file_id,referrer_ngram,referrer_search,userAgent_ngram,userAgent_search,version_id,complete_query,complete_query_search,filterquery,ngram_query_search,ngram_simplequery_search,simple_query,simple_query_search,range,rangeDescription,rangeDescription_ngram,rangeDescription_search,range_ngram,range_search,actingGroupId,actorMemberGroupId,bitstreamCount,solr_update_time_stamp,bitstreamId,core_update_run_nb
$ ps auxw | grep -E "(1864132|1659487)"
postgres 1659487 0.0 0.5 3269900 197120 ? Ss Jul25 0:03 postgres: 14/main: cgspace cgspace 127.0.0.1(61648) idle in transaction
postgres 1864132 0.1 0.7 3275704 254528 ? Ss 07:27 0:08 postgres: 14/main: cgspace cgspace 127.0.0.1(36998) idle in transaction
postgres 1880388 0.0 0.0 9208 2432 pts/3 S+ 08:48 0:00 grep -E (1864132|1659487)
select nextval ('public.tasklistitem_seq')
locks-age.sql
script and killed them:
$ psql < locks-age.sql | awk -F"|" '/ [[:digit:]][1-9]:[[:digit:]]{2}:[[:digit:]]{2}\./ {print $10}' | sort -u | xargs kill
Exception in thread "main" org.apache.solr.client.solrj.impl.BaseHttpSolrClient$RemoteSolrException: Error from server at http://localhost:8983/solr/statistics: ERROR: [doc=0008a7c1-e552-4a4e-93e4-4d23bf39964b] Error adding field 'workflowItemId'='0812be47-1bfe-45e2-9208-5bf10ee46f81' msg=For input string: "0812be47-1bfe-45e2-9208-5bf10ee46f81"
at org.apache.solr.client.solrj.impl.HttpSolrClient.executeMethod(HttpSolrClient.java:745)
at org.apache.solr.client.solrj.impl.HttpSolrClient.request(HttpSolrClient.java:259)
at org.apache.solr.client.solrj.impl.HttpSolrClient.request(HttpSolrClient.java:240)
at org.apache.solr.client.solrj.SolrRequest.process(SolrRequest.java:234)
at org.apache.solr.client.solrj.SolrClient.add(SolrClient.java:102)
at org.apache.solr.client.solrj.SolrClient.add(SolrClient.java:69)
at org.apache.solr.client.solrj.SolrClient.add(SolrClient.java:82)
at it.damore.solr.importexport.App.insertBatch(App.java:295)
at it.damore.solr.importexport.App.lambda$writeAllDocuments$10(App.java:276)
at it.damore.solr.importexport.BatchCollector.lambda$accumulator$0(BatchCollector.java:71)
at java.base/java.util.stream.ReduceOps$3ReducingSink.accept(ReduceOps.java:169)
at java.base/java.util.Iterator.forEachRemaining(Iterator.java:133)
at java.base/java.util.Spliterators$IteratorSpliterator.forEachRemaining(Spliterators.java:1845)
at java.base/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:509)
at java.base/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:499)
at java.base/java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:921)
at java.base/java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
at java.base/java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:682)
at it.damore.solr.importexport.App.writeAllDocuments(App.java:252)
at it.damore.solr.importexport.App.main(App.java:150)
workflowItemId
field when exporting
isInternal,workflowItemId,containerCommunity,containerCollection,containerItem,containerBitstream,dateYear,dateYearMonth,filterquery,complete_query,simple_query,complete_query_search,simple_query_search,ngram_query_search,ngram_simplequery_search,text,storage_statistics_type,storage_size,storage_nb_of_bitstreams,name,first_name,last_name,p_communities_id,p_communities_name,p_communities_map,p_group_id,p_group_name,p_group_map,group_id,group_name,group_map,parent_count,bitstreamId,bitstreamCount,actingGroupId,actorMemberGroupId,actingGroupParentId,rangeDescription,range,version_id,file_id,cua_version,core_update_run_nb,orphaned
$ chrt -b 0 ./run.sh -s http://localhost:8081/solr/statistics -a export -o /tmp/statistics-2020.json -f 'time:[2020-01-01T00\:00\:00Z TO 2020-12-31T23\:59\:59Z]' -k uid -S actingGroupId,actingGroupParentId,actorMemberGroupId,author_mtdt,author_mtdt_search,bitstreamCount,bitstreamId,complete_query,complete_query_search,containerBitstream,containerCollection,containerCommunity,containerItem,core_update_run_nb,countryCode_ngram,countryCode_search,cua_version,dateYear,dateYearMonth,file_id,filterquery,first_name,geoipcountrycode,geoIpCountryCode,group_id,group_map,group_name,ip_ngram,ip_search,isArchived,isInternal,iso_mtdt,iso_mtdt_search,isWithdrawn,last_name,name,ngram_query_search,ngram_simplequery_search,orphaned,parent_count,p_communities_id,p_communities_map,p_communities_name,p_group_id,p_group_map,p_group_name,range,rangeDescription,rangeDescription_ngram,rangeDescription_search,range_ngram,range_search,referrer_ngram,referrer_search,simple_query,simple_query_search,solr_update_time_stamp,storage_nb_of_bitstreams,storage_size,storage_statistics_type,subject_mtdt,subject_mtdt_search,text,userAgent_ngram,userAgent_search,version_id,workflowItemId
$ dspace metadata-export -i 10568/115087 -f /tmp/2023-07-28-initiatives.csv
$ csvcut -c 'cg.contributor.affiliation[en_US]' ~/Downloads/2023-07-28-initiatives.csv \
| sed -e 1d -e 's/^"//' -e 's/"$//' -e 's/||/\n/g' -e '/^$/d' \
| sort | uniq -c | sort -hr \
| awk 'BEGIN { FS = "^[[:space:]]+[[:digit:]]+[[:space:]]+" } {print $2}'\
| sed -e '1i cg.contributor.affiliation' -e 's/^\(.*\)$/"\1"/' \
> /tmp/2023-07-28-initiatives-affiliations.csv
$ psql < locks-age.sql | awk -F"|" '$9 ~ / [[:digit:]][1-9]:[[:digit:]]{2}:[[:digit:]]{2}\./ {print $10}' | sort -u | xargs kill
11:30:48.598436
in the age column (not 00:00:00) and kills them